diff --git a/KestrelMinima.Demo/KestrelMinima.Demo.csproj b/KestrelMinima.Demo/KestrelMinima.Demo.csproj new file mode 100644 index 0000000..0179cd3 --- /dev/null +++ b/KestrelMinima.Demo/KestrelMinima.Demo.csproj @@ -0,0 +1,16 @@ + + + + net10.0 + enable + enable + true + false + true + + + + + + + diff --git a/KestrelMinima.Demo/Program.cs b/KestrelMinima.Demo/Program.cs new file mode 100644 index 0000000..4620407 --- /dev/null +++ b/KestrelMinima.Demo/Program.cs @@ -0,0 +1,12 @@ +using KestrelMinima; + +var builder = WebApplication.CreateSlimBuilder(args); +builder.Logging.SetMinimumLevel(LogLevel.Warning); + +builder.WebHost + .UseKestrelMinima(o => o.ReactorCount = 8) + .ConfigureKestrel(o => o.ListenAnyIP(8080)); + +var app = builder.Build(); +app.MapGet("/", () => "Hello World!"); +app.Run(); diff --git a/KestrelMinima/Connection/Connection.InputPipe.cs b/KestrelMinima/Connection/Connection.InputPipe.cs new file mode 100644 index 0000000..4b8b2de --- /dev/null +++ b/KestrelMinima/Connection/Connection.InputPipe.cs @@ -0,0 +1,33 @@ +using System.IO.Pipelines; + +namespace KestrelMinima; + +/// +/// Kestrel-mode input path. The reactor copies recv bytes into a BCL +/// and Kestrel reads InputPipe.Reader — bypassing the +/// hand-rolled read IVTS, which can't take Kestrel's concurrent off-reactor +/// access. Output uses the write slab + a fire-and-forget FlushAsync (no IVTS). +/// +public sealed unsafe partial class Connection +{ + internal Pipe? InputPipe; + + internal void InitInputPipe() + => InputPipe = new Pipe(new PipeOptions( + pauseWriterThreshold: 0, + resumeWriterThreshold: 0, + useSynchronizationContext: false)); + + /// Reactor-thread: copy recv bytes into the pipe and publish. 
+    internal void FeedInput(byte* ptr, int len)
+    {
+        // A non-positive length carries no payload; bail out before building a
+        // span over it (a negative length would make the span constructor throw).
+        if (len <= 0)
+        {
+            return;
+        }
+
+        // Copy out of the caller's buffer into memory obtained from the pipe
+        // writer, then publish it to the reader.
+        PipeWriter writer = InputPipe!.Writer;
+        Span<byte> dst = writer.GetSpan(len);
+        new ReadOnlySpan<byte>(ptr, len).CopyTo(dst);
+        writer.Advance(len);
+        _ = writer.FlushAsync(); // no backpressure → completes synchronously
+    }
+
+    /// <summary>Reactor-thread: signal EOF to Kestrel's reader.</summary>
+    /// <param name="error">Optional failure handed to the pipe writer's completion; <c>null</c> for a clean EOF.</param>
+    internal void CompleteInput(Exception? error = null)
+        => InputPipe?.Writer.Complete(error);
+}
diff --git a/KestrelMinima/Connection/Connection.Read.cs b/KestrelMinima/Connection/Connection.Read.cs
new file mode 100644
index 0000000..6ade2ab
--- /dev/null
+++ b/KestrelMinima/Connection/Connection.Read.cs
@@ -0,0 +1,163 @@
+using System.Threading.Tasks.Sources;
+using KestrelMinima.Utils;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace KestrelMinima;
+
+///
+/// Per-connection state. The handler may run on any thread (e.g. resumed by
+/// a thread-pool timer); reactor-only side effects are funnelled through the
+/// MPSC queues on `Reactor`. Coordination uses Interlocked.Exchange on the
+/// arm flags and a sticky `_pending` to close the lost-wakeup race.
+///
+/// Lifetime is pool-managed: the reactor pops a Connection on accept (or new
+/// one if pool is empty), and pushes it back on teardown after `Clear()`. The
+/// `_generation` field is bumped on each `Clear` so stale `ValueTask` tokens
+/// from a previous connection life are detectable and return `Closed()`
+/// instead of leaking the new tenant's state.
+///
+public sealed unsafe partial class Connection : IValueTaskSource<RecvSnapshot>
+{
+    // Rebinds this instance to a new client fd; returns `this` so the call can be chained.
+    internal Connection SetFd(int fd)
+    {
+        ClientFd = fd;
+        return this;
+    }
+
+    private ManualResetValueTaskSourceCore<RecvSnapshot> _readSignal;
+    private int _armed;   // 1 while a ReadAsync caller is parked on _readSignal
+    private int _pending; // sticky "an event arrived while nobody was armed" flag
+    private int _closed;  // non-zero once the connection has been marked closed
+
+    private readonly SpscRecvRing _recv = new(capacityPow2: 16);
+
+    /// <summary>
+    /// Waits for the next batch of recv completions. Completes synchronously
+    /// when data (or a pending/closed signal) is already available; otherwise
+    /// arms the value-task source and returns a task backed by this instance.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">A previous ReadAsync is still armed.</exception>
+    public ValueTask<RecvSnapshot> ReadAsync()
+    {
+        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1)
+        {
+            Volatile.Write(ref _pending, 0);
+            return new ValueTask<RecvSnapshot>(
+                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+
+        if (Volatile.Read(ref _closed) != 0)
+        {
+            return new ValueTask<RecvSnapshot>(RecvSnapshot.Closed());
+        }
+
+        if (Interlocked.Exchange(ref _armed, 1) == 1)
+        {
+            throw new InvalidOperationException("ReadAsync already armed.");
+        }
+
+        // Snapshot the generation as the IVTS token so a future Clear() can
+        // invalidate this awaiter if the connection gets pool-recycled.
+        int gen = Volatile.Read(ref _generation);
+
+        // Race recovery: re-check between arming and returning the IVTS task.
+        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0)
+        {
+            // Answer synchronously only if we are the one who disarms. When the
+            // exchange returns 0, Complete()/MarkClosed() has already claimed the
+            // arm and publishes through _readSignal.SetResult; handing back a
+            // synchronous snapshot as well would leave that result sitting
+            // unconsumed in the core. Fall through so the awaiter observes it.
+            if (Interlocked.Exchange(ref _armed, 0) == 1)
+            {
+                Volatile.Write(ref _pending, 0);
+
+                return new ValueTask<RecvSnapshot>(
+                    new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+            }
+        }
+
+        return new ValueTask<RecvSnapshot>(this, (short)gen);
+    }
+
+    // Dequeues the next item, bounded by the tail captured in `snap`.
+    public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item)
+        => _recv.TryDequeueUntil(snap.Tail, out item);
+
+    // Resets the value-task core so it can be armed and completed again.
+    public void ResetRead() => _readSignal.Reset();
+
+    // Producer side: queue one recv completion, then either complete the armed
+    // awaiter or leave the sticky `_pending` flag for the next ReadAsync.
+    public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr)
+    {
+        if (!_recv.TryEnqueue(new SpscRecvRing.Item
+        {
+            Ptr = ptr,
+            Bid = bid,
+            Len = res,
+            HasBuffer = hasBuffer,
+            Gen = (ushort)Volatile.Read(ref _generation)
+        }))
+        {
+            // Ring full: give the buffer straight back and mark the connection closed.
+            Console.Error.WriteLine("[conn] recv queue overflow.");
+            if (hasBuffer)
+            {
+                _reactor.ReturnBufferDirect(bid);
+            }
+            Volatile.Write(ref _closed, 1);
+        }
+
+        if (Interlocked.Exchange(ref _armed, 0) == 1)
+        {
+            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+        else
+        {
+            Volatile.Write(ref _pending, 1);
+        }
+    }
+
+    internal void DrainRecv()
+    {
+        // Return any buffer IDs still sitting in the SPSC ring (handler exited
+        // before draining them, or a recv arrived after _closed was set).
+        while (_recv.TryDequeue(out SpscRecvRing.Item item))
+        {
+            if (item.HasBuffer)
+            {
+                _reactor.ReturnBufferDirect(item.Bid);
+            }
+        }
+    }
+
+    // =========================================================================
+    // IValueTaskSource plumbing — token (= snapshot of `_generation` at await
+    // time) is compared against the current `_generation` to detect stale
+    // awaiters from before a Clear()/pool reuse. Stale awaiters get a
+    // sentinel result rather than the new tenant's state.
+    //
+    // For the actual IVTS dispatch we pass `_readSignal.Version` /
+    // `_flushSignal.Version` to the underlying core (not `token`) because the
+    // core's version is bumped by ResetRead/CompleteFlush mid-life and is
+    // unrelated to the cross-life generation guard.
+    // NOTE(review): no `_flushSignal` is visible in this part of the class —
+    // confirm the reference above is still current.
+    // =========================================================================
+
+    // Stale token → sentinel Closed() snapshot; otherwise the core's result.
+    RecvSnapshot IValueTaskSource<RecvSnapshot>.GetResult(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return RecvSnapshot.Closed();
+        }
+
+        return _readSignal.GetResult(_readSignal.Version);
+    }
+
+    // Stale token → report Succeeded so the awaiter goes straight to GetResult.
+    ValueTaskSourceStatus IValueTaskSource<RecvSnapshot>.GetStatus(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return ValueTaskSourceStatus.Succeeded;
+        }
+
+        return _readSignal.GetStatus(_readSignal.Version);
+    }
+
+    void IValueTaskSource<RecvSnapshot>.OnCompleted(Action<object?> continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            // Stale — run the continuation now so the awaiter unblocks and
+            // gets RecvSnapshot.Closed() from GetResult.
+            continuation(state);
+
+            return;
+        }
+
+        _readSignal.OnCompleted(continuation, state, _readSignal.Version, flags);
+    }
+}
diff --git a/KestrelMinima/Connection/Connection.Write.cs b/KestrelMinima/Connection/Connection.Write.cs
new file mode 100644
index 0000000..8a95d08
--- /dev/null
+++ b/KestrelMinima/Connection/Connection.Write.cs
@@ -0,0 +1,102 @@
+using System.Buffers;
+using KestrelMinima.Utils;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace KestrelMinima;
+
+///
+/// Fire-and-forget write path: FlushAsync hands the slab to the reactor (an io_uring
+/// send SQE + eventfd wake) and returns synchronously. No IValueTaskSource, no
+/// awaiter scheduling, no continuation hop.
Safe for HTTP/1.1 plaintext because the
+/// client cannot send the next request until it receives the previous response —
+/// which in turn cannot happen until the kernel finishes our send (and the reactor
+/// has processed the resulting send CQE, which is what resets WriteHead/WriteTail).
+/// So by the time Kestrel produces the next response into this slab, the previous
+/// send is fully ack'd and the slab is free for reuse.
+/// NOTE(review): this argument presumes the client never pipelines requests —
+/// confirm that is enforced, since the reset in CompleteFlush is not synchronized
+/// with the thread producing into the slab.
+///
+public sealed unsafe partial class Connection : IBufferWriter<byte>
+{
+    private readonly int _writeSlabSize;
+    internal byte* WriteBuffer;
+    // WriteHead — bytes ack'd by the kernel (reactor thread mutates).
+    // WriteSubmitted — bytes queued to the kernel via SubmitSend (reactor thread mutates).
+    // WriteTail — bytes produced by Kestrel into the slab (Kestrel thread mutates).
+    internal int WriteHead;
+    internal int WriteSubmitted;
+    internal int WriteTail;
+
+    private readonly UnmanagedMemoryManager _manager;
+
+    // IBufferWriter
+#region IBufferWriter
+
+    /// <summary>Returns the free tail of the write slab as <see cref="Memory{T}"/>.</summary>
+    /// <param name="sizeHint">Minimum number of bytes required; 0 means "any non-empty buffer".</param>
+    /// <exception cref="InvalidOperationException">The slab does not have that much room left.</exception>
+    public Memory<byte> GetMemory(int sizeHint = 0)
+    {
+        int remaining = _writeSlabSize - WriteTail;
+        // IBufferWriter<T> must never hand back an empty buffer, so a hint of 0
+        // still needs at least one free byte.
+        if (remaining < Math.Max(sizeHint, 1))
+        {
+            throw new InvalidOperationException(
+                $"GetMemory: sizeHint={sizeHint} > remaining={remaining} (slab={_writeSlabSize}, WriteTail={WriteTail}, WriteSubmitted={WriteSubmitted}, WriteHead={WriteHead}, closed={Volatile.Read(ref _closed)})");
+        }
+
+        return _manager.Memory.Slice(WriteTail, remaining);
+    }
+
+    /// <summary>Returns the free tail of the write slab as <see cref="Span{T}"/>.</summary>
+    /// <param name="sizeHint">Minimum number of bytes required; 0 means "any non-empty buffer".</param>
+    /// <exception cref="InvalidOperationException">The slab does not have that much room left.</exception>
+    public Span<byte> GetSpan(int sizeHint = 0)
+    {
+        // Compare against the remaining room (as GetMemory does) instead of
+        // computing WriteTail + sizeHint, which can overflow for a huge hint.
+        int remaining = _writeSlabSize - WriteTail;
+        if (remaining < Math.Max(sizeHint, 1))
+        {
+            throw new InvalidOperationException(
+                $"GetSpan: sizeHint={sizeHint}, WriteTail={WriteTail}, slab={_writeSlabSize}, WriteSubmitted={WriteSubmitted}, WriteHead={WriteHead}, closed={Volatile.Read(ref _closed)}");
+        }
+
+        return new Span<byte>(WriteBuffer + WriteTail, remaining);
+    }
+
+    /// <summary>Commits <paramref name="count"/> bytes written into the last buffer handed out.</summary>
+    /// <exception cref="ArgumentOutOfRangeException">The count is negative or runs past the slab.</exception>
+    public void Advance(int count)
+    {
+        // WriteTail is later used as the end of a raw-pointer send range, so it
+        // must never leave [0, _writeSlabSize].
+        if (count < 0 || count > _writeSlabSize - WriteTail)
+        {
+            throw new ArgumentOutOfRangeException(nameof(count));
+        }
+
+        WriteTail += count;
+    }
+
+#endregion
+
+    // Write to the inner buffer
+    public void Write(ReadOnlySpan<byte> source)
+    {
+        int len = source.Length;
+        // Overflow-safe form of "WriteTail + len > _writeSlabSize".
+        if (len > _writeSlabSize - WriteTail)
+        {
+            throw new InvalidOperationException("Write buffer too small.");
+        }
+
+        source.CopyTo(new Span<byte>(WriteBuffer + WriteTail, len));
+        WriteTail += len;
+    }
+
+    // Fire-and-forget: hand the fd to the reactor and return. The reactor reads
+    // [WriteSubmitted, WriteTail) on drain and submits an SQE. Multi-flush within
+    // one response is handled naturally — the MPSC may have the fd queued multiple
+    // times, but the second drain finds end <= begin and no-ops.
+    public ValueTask FlushAsync()
+    {
+        if (Volatile.Read(ref _closed) == 1)
+        {
+            return default;
+        }
+
+        if (WriteTail == 0)
+        {
+            return default;
+        }
+
+        _reactor.EnqueueFlush(ClientFd);
+
+        return default;
+    }
+
+    // Reactor-thread: all submitted bytes ack'd AND no new bytes pending — reset.
+    internal void CompleteFlush()
+    {
+        WriteHead = 0;
+        WriteSubmitted = 0;
+        WriteTail = 0;
+    }
+}
diff --git a/KestrelMinima/Connection/Connection.cs b/KestrelMinima/Connection/Connection.cs
new file mode 100644
index 0000000..3b394c1
--- /dev/null
+++ b/KestrelMinima/Connection/Connection.cs
@@ -0,0 +1,68 @@
+using System.Runtime.InteropServices;
+using KestrelMinima.Utils;
+
+namespace KestrelMinima;
+
+public sealed unsafe partial class Connection
+{
+    private readonly Reactor _reactor;
+
+    public int ClientFd { get; private set; }
+
+    // Bumped on Clear(); the low 16 bits are used as the read IVTS token so
+    // stale awaiters can be detected after pool reuse. (The Kestrel path never
+    // touches the read IVTS, but it's reused by MarkClosed's `_readSignal`
+    // SetResult — harmless when nobody awaits.)
+    private int _generation;
+
+    /// <summary>Creates a connection bound to <paramref name="fd"/> with its own native write slab.</summary>
+    /// <param name="reactor">Owning reactor; flushes and buffer returns are routed to it.</param>
+    /// <param name="fd">Client socket file descriptor.</param>
+    /// <param name="writeSlabSize">Size in bytes of the native write slab; must be positive.</param>
+    public Connection(Reactor reactor, int fd, int writeSlabSize = 256 * 1024)
+    {
+        ArgumentNullException.ThrowIfNull(reactor);
+        // A non-positive size would wrap when cast to nuint for the allocation.
+        if (writeSlabSize <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(writeSlabSize));
+        }
+
+        _reactor = reactor;
+        ClientFd = fd;
+        _writeSlabSize = writeSlabSize;
+        WriteBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)writeSlabSize, 64);
+
+        _manager = new UnmanagedMemoryManager(WriteBuffer, writeSlabSize);
+    }
+
+    // Reactor-thread only — called from Recycle in the reactor's recv/send error paths.
+    public void MarkClosed()
+    {
+        Volatile.Write(ref _closed, 1);
+
+        if (Interlocked.Exchange(ref _armed, 0) == 1)
+        {
+            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true));
+        }
+        else
+        {
+            Volatile.Write(ref _pending, 1);
+        }
+    }
+
+    // Resets per-life state and bumps the generation so tokens from the
+    // previous life no longer match.
+    internal void Clear()
+    {
+        Interlocked.Increment(ref _generation);
+
+        Volatile.Write(ref _armed, 0);
+        Volatile.Write(ref _pending, 0);
+        Volatile.Write(ref _closed, 0);
+
+        WriteHead = 0;
+        WriteSubmitted = 0;
+        WriteTail = 0;
+
+        _readSignal.Reset();
+
+        _recv.Reset(); // discard any leftover SPSC items
+    }
+
+    // Frees the native write slab; safe to call more than once.
+    public void Dispose()
+    {
+        if (WriteBuffer != null)
+        {
+            NativeMemory.AlignedFree(WriteBuffer);
+            WriteBuffer = null;
+        }
+    }
+}
diff --git a/KestrelMinima/Connection/ConnectionDualPipe.cs b/KestrelMinima/Connection/ConnectionDualPipe.cs
new file mode 100644
index 0000000..62b518c
--- /dev/null
+++ b/KestrelMinima/Connection/ConnectionDualPipe.cs
@@ -0,0 +1,17 @@
+using System.IO.Pipelines;
+
+namespace KestrelMinima;
+
+public sealed class ConnectionDualPipe : IDuplexPipe
+{
+    public PipeReader Input { get; }
+    public PipeWriter Output { get; }
+
+    public ConnectionDualPipe(Connection connection)
+    {
+        ArgumentNullException.ThrowIfNull(connection);
+        // Kestrel mode only — InitInputPipe is always called on accept.
+        Input = connection.InputPipe!.Reader;
+        Output = new ConnectionPipeWriter(connection);
+    }
+}
\ No newline at end of file
diff --git a/KestrelMinima/Connection/ConnectionPipeWriter.cs b/KestrelMinima/Connection/ConnectionPipeWriter.cs
new file mode 100644
index 0000000..120db4a
--- /dev/null
+++ b/KestrelMinima/Connection/ConnectionPipeWriter.cs
@@ -0,0 +1,63 @@
+using System.IO.Pipelines;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace KestrelMinima;
+
+/// <summary>
+/// Adapts Minima's <see cref="Connection"/> write API (GetMemory/GetSpan/Advance/
+/// FlushAsync) to a standard <see cref="PipeWriter"/>, so PipeWriter-based code
+/// can write responses through the connection's per-connection slab.
+/// A thin wrapper — all the work lives in Connection.
+/// </summary>
+public sealed class ConnectionPipeWriter : PipeWriter
+{
+    private readonly Connection _conn;
+    private bool _completed;
+    private bool _cancelRequested;
+    private long _unflushed;
+
+    public ConnectionPipeWriter(Connection connection)
+    {
+        _conn = connection ?? throw new ArgumentNullException(nameof(connection));
+    }
+
+    public override bool CanGetUnflushedBytes => true;
+    public override long UnflushedBytes => _unflushed;
+
+    public override Memory<byte> GetMemory(int sizeHint = 0) => _conn.GetMemory(sizeHint);
+
+    public override Span<byte> GetSpan(int sizeHint = 0) => _conn.GetSpan(sizeHint);
+
+    public override void Advance(int bytes)
+    {
+        _unflushed += bytes;
+        _conn.Advance(bytes);
+    }
+
+    /// <summary>
+    /// Hands everything advanced so far to the connection's flush path.
+    /// </summary>
+    /// <param name="cancellationToken">If already cancelled, a cancelled task is returned and nothing is flushed.</param>
+    public override ValueTask<FlushResult> FlushAsync(CancellationToken cancellationToken = default)
+    {
+        // Previously the token was ignored entirely.
+        if (cancellationToken.IsCancellationRequested)
+        {
+            return ValueTask.FromCanceled<FlushResult>(cancellationToken);
+        }
+
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ValueTask<FlushResult>(new FlushResult(isCanceled: true, isCompleted: _completed));
+        }
+
+        _unflushed = 0;
+        ValueTask inner = _conn.FlushAsync();
+
+        if (inner.IsCompletedSuccessfully)
+            return new ValueTask<FlushResult>(new FlushResult(isCanceled: false, isCompleted: _completed));
+
+        return AwaitFlush(inner);
+    }
+
+    private async ValueTask<FlushResult> AwaitFlush(ValueTask inner)
+    {
+        await inner;
+        return new FlushResult(isCanceled: false, isCompleted: _completed);
+    }
+
+    // The next FlushAsync call reports IsCanceled once, then clears the request.
+    public override void CancelPendingFlush() => _cancelRequested = true;
+
+    // Only records completion; the exception argument is not used.
+    public override void Complete(Exception? exception = null) => _completed = true;
+}
diff --git a/KestrelMinima/Connection/RecvSnapshot.cs b/KestrelMinima/Connection/RecvSnapshot.cs
new file mode 100644
index 0000000..809a4b1
--- /dev/null
+++ b/KestrelMinima/Connection/RecvSnapshot.cs
@@ -0,0 +1,15 @@
+namespace KestrelMinima;
+
+public readonly struct RecvSnapshot
+{
+    public readonly long Tail;
+    public readonly bool IsClosed;
+
+    public RecvSnapshot(long tail, bool isClosed)
+    {
+        Tail = tail;
+        IsClosed = isClosed;
+    }
+
+    // Sentinel snapshot: tail 0, closed flag set.
+    public static RecvSnapshot Closed() => new(0, isClosed: true);
+}
\ No newline at end of file
diff --git a/KestrelMinima/KestrelMinima.csproj b/KestrelMinima/KestrelMinima.csproj
new file mode 100644
index 0000000..61a0661
--- /dev/null
+++ b/KestrelMinima/KestrelMinima.csproj
@@ -0,0 +1,21 @@
+
+
+
+ net10.0
+ enable
+ disable
+ true
+
+ KestrelMinima
+ 0.1.0
+ Diogo Martins
+ Minima's io_uring engine as a Kestrel transport. Per-core reactor drives accept and recv (multishot + provided buffers). Writes are fire-and-forget: FlushAsync enqueues an io_uring send SQE and returns synchronously — no IValueTaskSource, no awaiter scheduling. Safe for HTTP/1.1 plaintext because clients send the next request only after receiving the response. Self-contained — no external dependencies.
+ io_uring;kestrel;aspnetcore;transport;linux;networking;performance
+ MIT
+
+
+
+
+
+
+
diff --git a/KestrelMinima/KestrelMinimaEngine.cs b/KestrelMinima/KestrelMinimaEngine.cs
new file mode 100644
index 0000000..ffda90c
--- /dev/null
+++ b/KestrelMinima/KestrelMinimaEngine.cs
@@ -0,0 +1,45 @@
+namespace KestrelMinima;
+
+///
+/// Owns N io_uring reactors (each its own SO_REUSEPORT listener) and funnels
+/// accepted connections to the Kestrel transport via a channel.
+///
+public sealed class KestrelMinimaEngine
+{
+    private readonly Reactor[] _reactors;
+    private readonly Channel<Connection> _accepted =
+        Channel.CreateUnbounded<Connection>(new UnboundedChannelOptions
+        {
+            SingleReader = false,
+            SingleWriter = false,
+        });
+
+    /// <summary>Creates (but does not start) one reactor per <c>config.ReactorCount</c>.</summary>
+    /// <exception cref="ArgumentOutOfRangeException">The reactor count is below 1.</exception>
+    public KestrelMinimaEngine(ServerConfig config)
+    {
+        ArgumentNullException.ThrowIfNull(config);
+        // Zero reactors would start cleanly yet never accept anything.
+        if (config.ReactorCount < 1)
+        {
+            throw new ArgumentOutOfRangeException(nameof(config), "ReactorCount must be at least 1.");
+        }
+
+        _reactors = new Reactor[config.ReactorCount];
+        for (int i = 0; i < config.ReactorCount; i++)
+        {
+            _reactors[i] = new Reactor(i, config) { OnAccept = OnReactorAccept };
+        }
+    }
+
+    // NOTE(review): TryWrite returns false once Stop() has completed the channel;
+    // the connection is then dropped here without being closed — confirm the
+    // reactor reclaims it.
+    private void OnReactorAccept(Connection conn) => _accepted.Writer.TryWrite(conn);
+
+    // Launches each reactor's Run loop on its own named background thread.
+    public void Start()
+    {
+        for (int i = 0; i < _reactors.Length; i++)
+        {
+            int idx = i;
+            var t = new Thread(() => _reactors[idx].Run())
+            {
+                IsBackground = true,
+                Name = $"kestrel-minima-r{idx}",
+            };
+            t.Start();
+        }
+    }
+
+    public ValueTask<Connection> AcceptAsync(CancellationToken ct) => _accepted.Reader.ReadAsync(ct);
+
+    // Completes the accept channel only; the reactor threads are not signalled here.
+    public void Stop() => _accepted.Writer.TryComplete();
+}
diff --git a/KestrelMinima/KestrelMinimaKestrel.cs b/KestrelMinima/KestrelMinimaKestrel.cs
new file mode 100644
index 0000000..67b7d60
--- /dev/null
+++ b/KestrelMinima/KestrelMinimaKestrel.cs
@@ -0,0 +1,149 @@
+using System.Net;
+using Microsoft.AspNetCore.Connections;
+using Microsoft.AspNetCore.Connections.Features;
+using Microsoft.AspNetCore.Hosting;
+using Microsoft.AspNetCore.Http.Features;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+
+namespace KestrelMinima;
+
+internal sealed class KestrelMinimaConnectionContext : ConnectionContext,
+    IConnectionIdFeature, IConnectionTransportFeature, IConnectionItemsFeature,
+    IConnectionLifetimeFeature, IConnectionEndPointFeature
+{
+    private static long s_id;
+
+    private readonly Connection _conn;
+    private readonly ConnectionDualPipe _pipe;
+    private readonly CancellationTokenSource _closedCts = new();
+    private readonly FeatureCollection _features = new();
+    private bool _disposed;
+
+    public KestrelMinimaConnectionContext(Connection conn, EndPoint? localEndPoint)
+    {
+        _conn = conn;
+        _pipe = new ConnectionDualPipe(conn);
+
+        ConnectionId = $"kestrel-minima-{Interlocked.Increment(ref s_id):x}";
+        LocalEndPoint = localEndPoint;
+        Items = new ConnectionItems();
+        ConnectionClosed = _closedCts.Token;
+
+        // This object implements every feature interface itself.
+        _features.Set<IConnectionIdFeature>(this);
+        _features.Set<IConnectionTransportFeature>(this);
+        _features.Set<IConnectionItemsFeature>(this);
+        _features.Set<IConnectionLifetimeFeature>(this);
+        _features.Set<IConnectionEndPointFeature>(this);
+    }
+
+    public override string ConnectionId { get; set; }
+    public override IFeatureCollection Features => _features;
+    public override IDictionary<object, object?> Items { get; set; }
+    public override IDuplexPipe Transport
+    {
+        get => _pipe;
+        set => throw new NotSupportedException("Transport is owned by the KestrelMinima transport.");
+    }
+    public override CancellationToken ConnectionClosed { get; set; }
+    public override EndPoint? LocalEndPoint { get; set; }
+    public override EndPoint? RemoteEndPoint { get; set; }
+
+    // Best-effort teardown: each step is attempted even if an earlier one throws.
+    public override void Abort(ConnectionAbortedException abortReason)
+    {
+        try { _closedCts.Cancel(); } catch { }
+        try { _pipe.Input.Complete(abortReason); } catch { }
+        try { _pipe.Output.Complete(abortReason); } catch { }
+    }
+
+    // Idempotent; same best-effort steps as Abort, then releases the token source.
+    public override ValueTask DisposeAsync()
+    {
+        if (_disposed) return ValueTask.CompletedTask;
+        _disposed = true;
+        try { _closedCts.Cancel(); } catch { }
+        try { _pipe.Input.Complete(); } catch { }
+        try { _pipe.Output.Complete(); } catch { }
+        _closedCts.Dispose();
+        return ValueTask.CompletedTask;
+    }
+}
+
+internal sealed class KestrelMinimaConnectionListener : IConnectionListener
+{
+    private readonly KestrelMinimaEngine _engine;
+
+    public KestrelMinimaConnectionListener(KestrelMinimaEngine engine, EndPoint endpoint)
+    {
+        _engine = engine;
+        EndPoint = endpoint;
+    }
+
+    public EndPoint EndPoint { get; }
+
+    // Returns null when the wait is cancelled or the engine's channel has been completed.
+    public async ValueTask<ConnectionContext?> AcceptAsync(CancellationToken cancellationToken = default)
+    {
+        try
+        {
+            Connection conn = await _engine.AcceptAsync(cancellationToken).ConfigureAwait(false);
+            return new KestrelMinimaConnectionContext(conn, EndPoint);
+        }
+        catch (OperationCanceledException) { return null; }
+        catch (ChannelClosedException) { return null; }
+    }
+
+    public ValueTask UnbindAsync(CancellationToken cancellationToken = default) { _engine.Stop(); return ValueTask.CompletedTask; }
+    public ValueTask DisposeAsync() { _engine.Stop(); return ValueTask.CompletedTask; }
+}
+
+public sealed class KestrelMinimaTransportOptions
+{
+    public int ReactorCount { get; set; } = Math.Max(1, Environment.ProcessorCount);
+}
+
+public sealed class KestrelMinimaTransportFactory : IConnectionListenerFactory
+{
+    private readonly KestrelMinimaTransportOptions _options;
+    private readonly ILogger _logger;
+
+    public KestrelMinimaTransportFactory(IOptions<KestrelMinimaTransportOptions> options, ILoggerFactory loggerFactory)
+    {
+        _options = options.Value;
+        _logger = loggerFactory.CreateLogger<KestrelMinimaTransportFactory>();
+    }
+
+    /// <summary>Starts an engine for <paramref name="endpoint"/> and returns a listener over it.</summary>
+    /// <exception cref="NotSupportedException">The endpoint is not an <see cref="IPEndPoint"/>, or its port is 0.</exception>
+    public ValueTask<IConnectionListener> BindAsync(EndPoint endpoint, CancellationToken cancellationToken = default)
+    {
+        ArgumentNullException.ThrowIfNull(endpoint);
+
+        if (endpoint is not IPEndPoint ip)
+        {
+            throw new NotSupportedException(
+                $"KestrelMinima only supports {nameof(IPEndPoint)} (got {endpoint.GetType().Name}).");
+        }
+
+        // Every reactor opens its own listener for the configured port, so a
+        // kernel-assigned port (0) cannot yield one shared, reportable endpoint.
+        if (ip.Port == 0)
+        {
+            throw new NotSupportedException("KestrelMinima requires an explicit port; dynamic port 0 is not supported.");
+        }
+
+        // Only the port is forwarded to the reactors below; surface a
+        // non-wildcard bind address instead of dropping it silently.
+        if (!ip.Address.Equals(IPAddress.Any) && !ip.Address.Equals(IPAddress.IPv6Any))
+        {
+            _logger.LogWarning("[kestrel-minima] Bind address {Address} is not forwarded to the reactors; only port {Port} is used", ip.Address, ip.Port);
+        }
+
+        var config = new ServerConfig { Port = (ushort)ip.Port, ReactorCount = _options.ReactorCount, Incremental = false };
+        var engine = new KestrelMinimaEngine(config);
+        engine.Start();
+        _logger.LogInformation("[kestrel-minima] Bound :{Port} with {ReactorCount} io_uring reactor(s) (fire-and-forget send)", ip.Port, _options.ReactorCount);
+
+        IConnectionListener listener = new KestrelMinimaConnectionListener(engine, ip);
+        return ValueTask.FromResult(listener);
+    }
+}
+
+public static class KestrelMinimaKestrelExtensions
+{
+    ///
+    /// Replace Kestrel's socket transport with KestrelMinima: a per-core io_uring reactor for
+    /// accept/recv and a fire-and-forget io_uring send (FlushAsync enqueues an SQE and returns
+    /// synchronously — no IValueTaskSource awaiter scheduling). Linux only, HTTP/1.1 plaintext.
+    ///
+    public static IWebHostBuilder UseKestrelMinima(this IWebHostBuilder builder, Action<KestrelMinimaTransportOptions>? configure = null)
+    {
+        ArgumentNullException.ThrowIfNull(builder);
+
+        builder.ConfigureServices(services =>
+        {
+            // Options are only registered when the caller supplies a callback;
+            // otherwise the defaults on KestrelMinimaTransportOptions apply.
+            if (configure is not null) services.Configure(configure);
+            services.AddSingleton<IConnectionListenerFactory, KestrelMinimaTransportFactory>();
+        });
+        return builder;
+    }
+}
diff --git a/KestrelMinima/Reactor/Reactor.cs b/KestrelMinima/Reactor/Reactor.cs
new file mode 100644
index 0000000..9f74037
--- /dev/null
+++ b/KestrelMinima/Reactor/Reactor.cs
@@ -0,0 +1,510 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using KestrelMinima.Utils;
+using static KestrelMinima.Native;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace KestrelMinima;
+
+/// <summary>
+/// One reactor = one thread + one io_uring + one listening socket (SO_REUSEPORT)
+/// + one connection map. The reactor thread is the sole writer of the SQ ring,
+/// the kernel-shared buf_ring, and the connection map. Handlers may run on any
+/// thread (e.g. resumed by a thread-pool timer after `await Task.Delay(1)`);
+/// they reach the reactor only through two MPSC queues (`_returnQ`, `_flushQ`)
+/// woken by an `eventfd` registered as a multishot poll in the ring.
+/// </summary>
+public sealed unsafe partial class Reactor
+{
+    public readonly int Id;
+    public Ring Ring = null!; // created on the reactor's own thread (DEFER_TASKRUN requires same-thread setup+enter)
+    public readonly Dictionary<int, Connection> Connections = new(); // keyed by client fd
+
+    /// <summary>Set by the Kestrel transport: vend each accepted connection instead of running an inline handler.</summary>
+    public Action<Connection>? OnAccept;
+
+    private int _listenFd;
+    private readonly ServerConfig _config;
+    private readonly ushort _port;
+    private readonly uint _ringEntries;
+    private readonly uint RecvBufferSize;
+
+    // CQE user_data layout: kind tag in the high 32 bits, fd in the low 32.
+    private const ulong KindAccept = 1UL << 32;
+    private const ulong KindRecv = 2UL << 32;
+    private const ulong KindSend = 3UL << 32;
+    private const ulong KindWake = 4UL << 32; // eventfd-based cross-thread wake
+
+    // Provided-buffer ring (one per reactor, shared by all its connections).
+    private const ushort BgId = 1;
+    private readonly uint BufferRingEntries; // power of two
+    private byte* _bufRing; // io_uring_buf_ring (kernel-shared)
+    private byte* _bufSlab; // contiguous slab of recv buffers
+    private uint _bufRingMask;
+    private ushort _bufRingTail;
+
+    // Cross-thread wake mechanism: handlers running off-reactor enqueue work
+    // into these MPSC queues and `eventfd_write` _wakeFd; a multishot poll on
+    // _wakeFd registered with the ring delivers a CQE that wakes the reactor.
+    // When the caller is already the reactor thread (the common case — handler
+    // resumed inline from an IVTS SetResult), the Enqueue* methods bypass
+    // the queue and call the direct op, avoiding 2 syscalls per request.
+    private int _wakeFd;
+    private int _reactorThreadId;
+    private readonly Mpsc<ushort> _returnQ = new(1 << 14); // 16384 slots of buffer IDs
+    private readonly Mpsc<int> _flushQ = new(1 << 12); // 4096 slots of fds (fire-and-forget hand-off from off-reactor FlushAsync)
+
+    // Connection pool. Reactor-thread-only — accept and teardown both run on
+    // this reactor, so a plain Stack is sufficient (no MPMC primitive
+    // needed). PoolMax caps the slab footprint per reactor:
+    // PoolMax × WriteSlabSize × ReactorCount = total reserved native memory.
+    private readonly int PoolMax;
+    private readonly Stack<Connection> _pool;
+
+    // Transient io_uring_enter errnos (Linux): interrupted, would-block, busy.
+    private const int EINTR = 4;
+    private const int EAGAIN = 11;
+    private const int EBUSY = 16;
+
+    /// <summary>Captures configuration only; the ring, listener and buffers are created in <c>Run</c>.</summary>
+    /// <exception cref="ArgumentOutOfRangeException">BufferRingEntries is not a power of two in 1..32768.</exception>
+    public Reactor(int id, ServerConfig config)
+    {
+        ArgumentNullException.ThrowIfNull(config);
+
+        Id = id;
+        _config = config;
+        _port = config.Port;
+        _ringEntries = config.RingEntries;
+        RecvBufferSize = (uint)config.RecvBufferSize;
+        BufferRingEntries = (uint)config.BufferRingEntries;
+        // _bufRingMask = entries - 1 is only a valid mask for a power of two, and
+        // InitBufferRing counts buffer IDs with a ushort, so 65536 entries would
+        // never terminate. A provided-buffer ring is also limited to 32768 entries.
+        if (BufferRingEntries == 0 || BufferRingEntries > 32768 || (BufferRingEntries & (BufferRingEntries - 1)) != 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(config), "BufferRingEntries must be a power of two between 1 and 32768.");
+        }
+
+        PoolMax = config.PoolMax;
+        _pool = new Stack<Connection>(config.PoolMax);
+    }
+
+    // =========================================================================
+    // Buffer ring
+    // =========================================================================
+
+    // Allocates the buffer ring and the recv slab, registers the ring with the
+    // kernel under BgId, then publishes every buffer once.
+    private void InitBufferRing()
+    {
+        nuint ringBytes = (nuint)BufferRingEntries * 16;
+        _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096);
+        NativeMemory.Clear(_bufRing, ringBytes);
+
+        nuint slabBytes = BufferRingEntries * (nuint)RecvBufferSize;
+        _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64);
+
+        _bufRingMask = BufferRingEntries - 1;
+
+        var reg = new io_uring_buf_reg {
+            ring_addr = (ulong)_bufRing,
+            ring_entries = BufferRingEntries,
+            bgid = BgId,
+        };
+
+        int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, &reg, 1);
+        if (ret < 0)
+        {
+            int err = Marshal.GetLastPInvokeError();
+
+            // Nothing references the buffers yet, so release them before failing.
+            NativeMemory.AlignedFree(_bufSlab);
+            NativeMemory.AlignedFree(_bufRing);
+            _bufSlab = null;
+            _bufRing = null;
+
+            throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}");
+        }
+
+        // Populate every slot once. Slot 0 overlaps with the ring's tail field
+        // at offset 14, but we only write addr/len/bid (offsets 0..13) so tail
+        // stays at zero until we set it explicitly.
+        for (ushort bid = 0; bid < BufferRingEntries; bid++) {
+            byte* slot = _bufRing + (uint)bid * 16;
+            *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+            *(uint*)(slot + 8) = RecvBufferSize;
+            *(ushort*)(slot + 12) = bid;
+        }
+        _bufRingTail = (ushort)BufferRingEntries;
+
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    // Reactor-thread-only: writes the kernel-shared buf_ring tail directly.
+ // Off-reactor callers must use EnqueueReturnQ instead. + internal void ReturnBufferDirect(ushort bid) + { + byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize); + *(uint*)(slot + 8) = RecvBufferSize; + *(ushort*)(slot + 12) = bid; + _bufRingTail++; + + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + + // ========================================================================= + // Cross-thread entry points (safe to call from any thread) + // ========================================================================= + + public void EnqueueReturnQ(ushort bid) + { + // Fast path: caller is the reactor thread (handler running inline from + // an IVTS SetResult). Go straight to the buf_ring — no queue, no syscall. + if (Environment.CurrentManagedThreadId == _reactorThreadId) + { + ReturnBufferDirect(bid); + return; + } + SpinWait sw = default; + while (!_returnQ.TryEnqueue(bid)) + { + sw.SpinOnce(); + } + WakeFdWrite(); + } + + internal void EnqueueFlush(int fd) + { + // Fast path: caller is the reactor thread; submit the SQE for the new + // bytes [WriteSubmitted, WriteTail) directly. + if (Environment.CurrentManagedThreadId == _reactorThreadId) + { + if (Connections.TryGetValue(fd, out var conn)) + { + SubmitSendRange(fd, conn); + } + return; + } + SpinWait sw = default; + while (!_flushQ.TryEnqueue(fd)) + { + sw.SpinOnce(); + } + WakeFdWrite(); + } + + private void WakeFdWrite() + { + ulong v = 1; + // 8-byte write to eventfd increments its counter; the kernel marks the + // fd readable, which fires our registered multishot poll's next CQE. 
+ write(_wakeFd, &v, 8); + } + + private void DrainReturnQ() + { + while (_returnQ.TryDequeue(out ushort bid)) + { + ReturnBufferDirect(bid); + } + } + + private void DrainFlushQ() + { + while (_flushQ.TryDequeue(out int fd)) + { + if (!Connections.TryGetValue(fd, out var conn)) + { + continue; + } + // Mpsc Enqueue/Dequeue establishes happens-before, so WriteTail is + // visible here. + SubmitSendRange(fd, conn); + } + } + + // Submit a send for the new bytes the connection has produced since the last + // SubmitSend. Reactor-thread only — mutates WriteSubmitted. + private void SubmitSendRange(int fd, Connection conn) + { + int begin = conn.WriteSubmitted; + int end = conn.WriteTail; + if (end <= begin) + { + return; + } + SubmitSend(fd, conn.WriteBuffer + begin, (uint)(end - begin)); + conn.WriteSubmitted = end; + } + + private void ArmWakePoll() + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_POLL_ADD; + sqe->fd = _wakeFd; + sqe->op_flags = POLLIN; // poll32_events lives at this offset + sqe->len = IORING_POLL_ADD_MULTI; // multishot — stays armed across CQEs + sqe->user_data = KindWake | (uint)_wakeFd; + } + + // ========================================================================= + // Main loop + // ========================================================================= + + public void Run() + { + _reactorThreadId = Environment.CurrentManagedThreadId; + + Ring = Ring.Create(_ringEntries); + _listenFd = OpenReusePortListener(_port); + + InitBufferRing(); + + _wakeFd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (_wakeFd < 0) + { + throw new InvalidOperationException("eventfd failed"); + } + + Console.WriteLine($"[kestrel-minima r{Id}] listening on 0.0.0.0:{_port}"); + SubmitAcceptMultishot(); + ArmWakePoll(); + + LoopShared(); + + close(_listenFd); + close(_wakeFd); + Ring.Dispose(); + } + + private void LoopShared() + { + while (true) + { + // Drain MPSC queues from off-reactor handlers. 
// Routes one completion to the logic for its kind. user_data carries the kind
// tag in its upper 32 bits and the fd the SQE was issued for in its lower 32.
// Completions with an unrecognised kind are ignored.
//
// Reactor-thread only: it mutates Connections, the buffer ring and the
// per-connection write cursors without synchronisation.
private void Dispatch(in IoUringCqe cqe)
{
    ulong kind = cqe.user_data & 0xffffffff_00000000UL;
    int fd = (int)(cqe.user_data & 0xffffffffUL);
    bool more = (cqe.flags & IORING_CQE_F_MORE) != 0;

    if (kind == KindWake)
    {
        // Drain the eventfd counter so the next write makes it readable again.
        // The queues themselves are drained at the top of the next loop
        // iteration, so there is nothing else to do here.
        ulong drain;
        read(_wakeFd, &drain, 8);
        if (!more)
        {
            // The kernel ended the multishot poll; arm a new one.
            ArmWakePoll();
        }
        return;
    }

    if (kind == KindAccept)
    {
        if (cqe.res >= 0)
        {
            int clientFd = cqe.res;
            SetNoDelay(clientFd);

            // Reuse a pooled connection when one is available.
            Connection conn = _pool.TryPop(out var pooled)
                ? pooled.SetFd(clientFd)
                : new Connection(this, clientFd, _config.WriteSlabSize);
            Connections[clientFd] = conn;
            conn.InitInputPipe();           // recv bytes are copied into this pipe
            SubmitRecvMultishot(clientFd);
            OnAccept?.Invoke(conn);         // hand the connection to the transport
        }
        else
        {
            Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}");
        }

        // Multishot accept stays armed; re-arm only if the kernel ended it.
        if (!more)
        {
            SubmitAcceptMultishot();
        }
    }
    else if (kind == KindRecv)
    {
        // Linux errno "No buffer space available". A buffer-select recv
        // completes with -ENOBUFS when the provided-buffer ring is empty;
        // that is back-pressure, not a connection failure.
        const int ENOBUFS = 105;

        bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0;
        ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0;

        if (cqe.res == -ENOBUFS)
        {
            if (hasBuf)
            {
                ReturnBufferDirect(bid);
            }

            // Keep the connection and re-arm the receive. Buffers consumed by
            // earlier completions are handed back as those completions are
            // dispatched, so the re-armed request can pick one up.
            if (!more && Connections.TryGetValue(fd, out _))
            {
                SubmitRecvMultishot(fd);
            }
            return;
        }

        if (cqe.res <= 0)
        {
            // Peer EOF or recv error — the reactor owns teardown.
            if (hasBuf)
            {
                ReturnBufferDirect(bid);
            }
            if (Connections.Remove(fd, out var dyingConn))
            {
                Recycle(dyingConn, fd);
            }
            return;
        }

        if (!Connections.TryGetValue(fd, out var conn))
        {
            // Straggler completion for a connection that is already gone:
            // just give the buffer back.
            if (hasBuf)
            {
                ReturnBufferDirect(bid);
            }
            return;
        }

        // Copy the received bytes into the connection's input pipe, then
        // return the buffer to the ring immediately.
        if (hasBuf)
        {
            conn.FeedInput(_bufSlab + (nuint)bid * (nuint)RecvBufferSize, cqe.res);
            ReturnBufferDirect(bid);
        }

        if (!more)
        {
            SubmitRecvMultishot(fd);
        }
    }
    else if (kind == KindSend)
    {
        if (!Connections.TryGetValue(fd, out var conn))
        {
            return;
        }
        if (cqe.res <= 0)
        {
            // Send error — the reactor owns teardown.
            Connections.Remove(fd);
            Recycle(conn, fd);
            return;
        }

        conn.WriteHead += cqe.res;
        if (conn.WriteHead < conn.WriteSubmitted)
        {
            // Partial send: resubmit the unsent remainder of the last batch.
            SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteSubmitted - conn.WriteHead));
            return;
        }

        // Everything submitted has been acknowledged. Complete the flush only
        // when no further bytes have been produced; otherwise a later flush
        // submission picks up the pending range.
        if (conn.WriteHead == conn.WriteTail)
        {
            conn.CompleteFlush();
        }
    }
}
// Creates an IPv4 TCP socket with SO_REUSEADDR and SO_REUSEPORT set, binds it
// to 0.0.0.0:<port> and starts listening with a backlog of 128.
// Returns the listening fd. Throws InvalidOperationException when socket, bind
// or listen fails; on bind/listen failure the socket is closed first so the fd
// is not leaked.
private static int OpenReusePortListener(ushort port)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
    {
        throw new InvalidOperationException($"socket failed: {fd}");
    }

    try
    {
        // Best effort, as before: the option results are not inspected.
        int one = 1;
        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int));

        sockaddr_in addr = default;
        addr.sin_family = AF_INET;
        addr.sin_port = Htons(port);
        addr.sin_addr.s_addr = 0; // 0.0.0.0

        if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0)
        {
            throw new InvalidOperationException("bind failed");
        }

        if (listen(fd, 128) < 0)
        {
            throw new InvalidOperationException("listen failed");
        }

        return fd;
    }
    catch
    {
        // The caller never sees this fd, so it must be released here.
        close(fd);
        throw;
    }
}
+ public uint RingEntries { get; init; } = 8192; + + // Shared buffer ring (used when Incremental == false). + public int RecvBufferSize { get; init; } = 32 * 1024; + public int BufferRingEntries { get; init; } = 4096; + + // Per-connection write slab + connection pool cap. + public int WriteSlabSize { get; init; } = 256 * 1024; + public int PoolMax { get; init; } = 1024; + + // Incremental mode (IOU_PBUF_RING_INC) — per-connection rings. + // reserved native memory ≈ PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount. + public bool Incremental { get; init; } = false; + public int MaxConnections { get; init; } = 4096; // GID cap (one bgid per active connection) + public int ConnBufRingEntries { get; init; } = 16; // buffers per connection ring + public int IncRecvBufferSize { get; init; } = 4096; // bytes per buffer (filled incrementally) +} diff --git a/KestrelMinima/Utils/Mpsc.cs b/KestrelMinima/Utils/Mpsc.cs new file mode 100644 index 0000000..3724976 --- /dev/null +++ b/KestrelMinima/Utils/Mpsc.cs @@ -0,0 +1,115 @@ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace KestrelMinima.Utils; + +/// +/// Bounded lock-free multi-producer / single-consumer queue. +/// +/// Dmitry Vyukov's bounded MPMC algorithm, specialised to one consumer. +/// Power-of-two capacity, zero-allocation after construction. Producers claim a +/// slot via CAS on the enqueue position (a failed TryEnqueue on a full queue +/// leaves the position untouched — no burned tickets); the single consumer +/// advances the dequeue position with a plain write. Each slot carries a +/// sequence number that coordinates ownership between producers and consumer. +/// +/// One generic queue serves every reactor handoff: Mpsc<ushort> for buffer +/// returns, Mpsc<int> for flush fds, Mpsc<ulong> for packed incremental +/// returns. T is unmanaged so each Cell is a blittable value type with no GC refs. 
/// <summary>
/// Bounded multi-producer / single-consumer queue over a power-of-two array.
/// Each slot carries a sequence number: it equals the ticket when the slot is
/// free for that ticket, ticket + 1 once the value is published, and
/// ticket + capacity after the consumer has released it for the next lap.
/// </summary>
internal sealed class Mpsc<T> where T : unmanaged
{
    private struct Cell
    {
        public long Sequence;
        public T Value;
    }

    private readonly Cell[] _buffer;
    private readonly int _mask;

    // Kept on separate 64-byte-sized fields (see PaddedLong).
    private PaddedLong _enqueuePos;
    private PaddedLong _dequeuePos;

    /// <summary>Creates a queue with <paramref name="capacityPow2"/> slots.</summary>
    /// <exception cref="ArgumentException">Capacity is below 2 or not a power of two.</exception>
    public Mpsc(int capacityPow2)
    {
        if (capacityPow2 < 2 || (capacityPow2 & (capacityPow2 - 1)) != 0)
        {
            throw new ArgumentException("Capacity must be a power of two >= 2.", nameof(capacityPow2));
        }

        _mask = capacityPow2 - 1;
        _buffer = new Cell[capacityPow2];

        // Slot i starts out free for ticket i.
        for (int slot = 0; slot < _buffer.Length; slot++)
        {
            _buffer[slot].Sequence = slot;
        }
    }

    /// <summary>Safe for concurrent producers. Returns false when the queue is full.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool TryEnqueue(T item)
    {
        Cell[] cells = _buffer;
        int mask = _mask;

        for (;;)
        {
            long ticket = Volatile.Read(ref _enqueuePos.Value);
            ref Cell slot = ref cells[(int)ticket & mask];
            long lag = Volatile.Read(ref slot.Sequence) - ticket;

            if (lag < 0)
            {
                // The slot still holds an unconsumed value from the previous lap.
                return false;
            }

            if (lag > 0)
            {
                // Our ticket is stale (another producer moved on); reload.
                continue;
            }

            if (Interlocked.CompareExchange(ref _enqueuePos.Value, ticket + 1, ticket) != ticket)
            {
                // Lost the claim race; reload and retry.
                continue;
            }

            // The slot is ours: store the value, then publish it to the consumer.
            slot.Value = item;
            Volatile.Write(ref slot.Sequence, ticket + 1);
            return true;
        }
    }

    /// <summary>Single consumer only. Returns false when nothing is published at the head.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool TryDequeue(out T item)
    {
        long next = _dequeuePos.Value; // only the consumer touches this: plain read
        ref Cell slot = ref _buffer[(int)next & _mask];

        if (Volatile.Read(ref slot.Sequence) != next + 1)
        {
            item = default;
            return false;
        }

        item = slot.Value;
        _dequeuePos.Value = next + 1; // plain write, single consumer
        Volatile.Write(ref slot.Sequence, next + _mask + 1); // free the slot for the next lap
        return true;
    }
}

/// <summary>
/// A long in a 64-byte struct so two adjacent position counters do not sit
/// within the same 64 bytes. Declared top-level and non-generic so it can use
/// explicit layout.
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = 64)]
internal struct PaddedLong
{
    [FieldOffset(0)] public long Value;
}
/// <summary>
/// One link in a chain of byte segments that can back a multi-segment
/// <see cref="ReadOnlySequence{T}"/>. Each link also records the id of the
/// buffer its memory came from.
/// </summary>
public sealed class RingSegment : ReadOnlySequenceSegment<byte>
{
    /// <summary>Creates a stand-alone segment over <paramref name="memory"/>.</summary>
    public RingSegment(ReadOnlyMemory<byte> memory, ushort bufferId)
    {
        BufferId = bufferId;
        Memory = memory;
    }

    /// <summary>Id of the buffer this segment's memory belongs to.</summary>
    public ushort BufferId { get; }

    /// <summary>
    /// Links a new segment after this one and returns it. The new segment's
    /// running index is this segment's running index plus this segment's length.
    /// </summary>
    public RingSegment Append(ReadOnlyMemory<byte> memory, ushort bufferId)
    {
        RingSegment tail = new(memory, bufferId);
        tail.RunningIndex = RunningIndex + Memory.Length;
        Next = tail;
        return tail;
    }
}
/// <summary>
/// Exposes a raw, caller-owned byte region as <see cref="Memory{T}"/>.
/// The manager neither allocates nor frees: <see cref="Unpin"/> and
/// <see cref="Dispose(bool)"/> are no-ops, so the pointer must stay valid for
/// as long as any memory or span obtained from this instance is in use.
/// </summary>
public sealed unsafe class UnmanagedMemoryManager : MemoryManager<byte>
{
    private readonly byte* _ptr;
    private readonly int _length;

    /// <summary>Buffer id supplied at construction (0 when none was given).</summary>
    public ushort BufferId { get; }

    /// <summary>Wraps <paramref name="length"/> bytes at <paramref name="ptr"/> with buffer id 0.</summary>
    public UnmanagedMemoryManager(byte* ptr, int length)
        : this(ptr, length, bufferId: 0)
    {
    }

    /// <summary>Wraps <paramref name="length"/> bytes at <paramref name="ptr"/>.</summary>
    public UnmanagedMemoryManager(byte* ptr, int length, ushort bufferId)
    {
        _ptr = ptr;
        _length = length;
        BufferId = bufferId;
    }

    /// <inheritdoc/>
    public override Span<byte> GetSpan() => new(_ptr, _length);

    /// <summary>
    /// Returns a handle to the byte at <paramref name="elementIndex"/>. The
    /// memory is unmanaged, so nothing is actually pinned.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="elementIndex"/> is negative or greater than the length.
    /// </exception>
    public override MemoryHandle Pin(int elementIndex = 0)
    {
        // Single unsigned compare rejects negatives as well; index == length
        // (one past the end) is allowed, matching an empty slice.
        if ((uint)elementIndex > (uint)_length)
        {
            throw new ArgumentOutOfRangeException(nameof(elementIndex));
        }

        return new MemoryHandle(_ptr + elementIndex);
    }

    /// <inheritdoc/>
    public override void Unpin() { }

    /// <inheritdoc/>
    protected override void Dispose(bool disposing) { }
}
a/KestrelMinima/_usings.cs b/KestrelMinima/_usings.cs new file mode 100644 index 0000000..085b05b --- /dev/null +++ b/KestrelMinima/_usings.cs @@ -0,0 +1,10 @@ +global using System; +global using System.Buffers; +global using System.Collections.Generic; +global using System.IO.Pipelines; +global using System.Runtime.CompilerServices; +global using System.Runtime.InteropServices; +global using System.Threading; +global using System.Threading.Channels; +global using System.Threading.Tasks; +global using System.Threading.Tasks.Sources; diff --git a/KestrelMinima/io_uring/Native.cs b/KestrelMinima/io_uring/Native.cs new file mode 100644 index 0000000..ca19e4c --- /dev/null +++ b/KestrelMinima/io_uring/Native.cs @@ -0,0 +1,162 @@ +using System.Runtime.InteropServices; + +namespace KestrelMinima; + +/// +/// All native interop in one file: io_uring syscalls, libc socket calls, +/// the kernel struct layouts they expect, and the constants needed to +/// drive a minimal io_uring loop. +/// +public static unsafe class Native { + private const long SYS_IO_URING_SETUP = 425; + private const long SYS_IO_URING_ENTER = 426; + private const long SYS_IO_URING_REGISTER = 427; + + public const byte IORING_OP_POLL_ADD = 6; + public const byte IORING_OP_ACCEPT = 13; + public const byte IORING_OP_SEND = 26; + public const byte IORING_OP_RECV = 27; + public const uint IORING_ENTER_GETEVENTS = 1u << 0; + public const long IORING_OFF_SQ_RING = 0; + public const long IORING_OFF_SQES = 0x10000000; + + // Multishot / buffer-ring goodies. 
+ public const ushort IORING_ACCEPT_MULTISHOT = 1 << 0; + public const ushort IORING_RECV_MULTISHOT = 1 << 1; + public const byte IOSQE_BUFFER_SELECT = 1 << 5; + public const uint IORING_CQE_F_BUFFER = 1u << 0; + public const uint IORING_CQE_F_MORE = 1u << 1; + public const int IORING_CQE_BUFFER_SHIFT = 16; + public const uint IORING_REGISTER_PBUF_RING = 22; + public const uint IORING_UNREGISTER_PBUF_RING = 23; + public const uint IORING_POLL_ADD_MULTI = 1u << 0; + + // Incremental provided-buffer consumption (kernel 6.12+). IOU_PBUF_RING_INC + // is set in io_uring_buf_reg.flags at registration; IORING_CQE_F_BUF_MORE is + // set on recv CQEs while the kernel will keep appending to the same buffer. + public const ushort IOU_PBUF_RING_INC = 2; + public const uint IORING_CQE_F_BUF_MORE = 1u << 4; + + // eventfd flags + poll mask (used for the cross-thread wake mechanism). + public const int EFD_CLOEXEC = 0x80000; + public const int EFD_NONBLOCK = 0x800; + public const uint POLLIN = 0x0001; + + // Setup flags. SINGLE_ISSUER tells the kernel only one thread will submit + // to this ring (skips locking on the SQ). DEFER_TASKRUN defers completion + // processing until io_uring_enter(GETEVENTS), which lets the kernel batch + // work and avoids interrupting the reactor with task_work mid-flight. 
// Raw libc syscall() binding used by io_uring_enter. SetLastError captures
// errno right after the call so the wrapper below can report the real cause.
[DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
private static extern long syscall6(long nr, uint a1, uint a2, uint a3, uint a4, void* a5, nuint a6);

/// <summary>
/// Submits <paramref name="toSubmit"/> SQEs on ring <paramref name="fd"/> and,
/// depending on <paramref name="flags"/>, waits for
/// <paramref name="minComplete"/> completions.
/// </summary>
/// <returns>
/// The syscall's non-negative result on success, or the negated errno on
/// failure (for example <c>-EINTR</c>). The libc wrapper itself only returns
/// -1 and sets errno, so the translation happens here; callers can compare
/// the result against negated errno constants.
/// </returns>
public static int io_uring_enter(int fd, uint toSubmit, uint minComplete, uint flags)
{
    long rc = syscall6(SYS_IO_URING_ENTER, (uint)fd, toSubmit, minComplete, flags, null, 0);
    if (rc >= 0)
    {
        return (int)rc;
    }

    int errno = Marshal.GetLastPInvokeError();
    // Never report a failure as 0: fall back to -1 if errno was not captured.
    return errno != 0 ? -errno : -1;
}
/// <summary>
/// Converts a 16-bit value from host byte order to network (big-endian) byte
/// order. Bytes are swapped only on little-endian hosts; on a big-endian host
/// the value is already in network order and is returned unchanged.
/// </summary>
public static ushort Htons(ushort x) =>
    BitConverter.IsLittleEndian
        ? System.Buffers.Binary.BinaryPrimitives.ReverseEndianness(x)
        : x;
+ [StructLayout(LayoutKind.Sequential)] + public struct io_uring_buf_reg { + public ulong ring_addr; + public uint ring_entries; + public ushort bgid; + public ushort flags; + public ulong resv1, resv2, resv3; + } + + [StructLayout(LayoutKind.Sequential)] + public struct in_addr { public uint s_addr; } + + [StructLayout(LayoutKind.Sequential)] + public unsafe struct sockaddr_in { + public ushort sin_family; + public ushort sin_port; + public in_addr sin_addr; + public fixed byte sin_zero[8]; + } +} diff --git a/KestrelMinima/io_uring/Ring.cs b/KestrelMinima/io_uring/Ring.cs new file mode 100644 index 0000000..240c423 --- /dev/null +++ b/KestrelMinima/io_uring/Ring.cs @@ -0,0 +1,179 @@ +using System.Runtime.CompilerServices; +using static KestrelMinima.Native; + +// ReSharper disable SuggestVarOrType_BuiltInTypes +// ReSharper disable SuggestVarOrType_Elsewhere +#pragma warning disable CA1806 + +namespace KestrelMinima; + +public sealed unsafe class Ring : IDisposable +{ + private int _fd; + + public int Fd => _fd; + + private uint* _sqHead; + private uint* _sqTail; + private uint* _sqArray; + private uint _sqMask; + private uint _sqEntries; + private IoUringSqe* _sqes; + + private uint* _cqHead; + private uint* _cqTail; + private IoUringCqe* _cqes; + private uint _cqMask; + + private uint _sqeTail; + + private byte* _ringPtr; + private nuint _ringSize; + private byte* _sqePtr; + private nuint _sqeSize; + + public static Ring Create(uint entries) + { + IoUringParams ioUringParams = default; + ioUringParams.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN; + int fd = io_uring_setup(entries, &ioUringParams); + if (fd < 0) + { + throw new InvalidOperationException($"io_uring_setup failed: {fd}"); + } + + var ring = new Ring + { + _fd = fd, + _sqEntries = ioUringParams.sq_entries + }; + + nuint sqRingBytes = ioUringParams.sq_off.array + ioUringParams.sq_entries * sizeof(uint); + nuint cqRingBytes = ioUringParams.cq_off.cqes + ioUringParams.cq_entries * 
/// <summary>
/// Reserves the next submission-queue entry, or returns null when every entry
/// between the kernel's head and the local tail is already in use. The entry
/// is not visible to the kernel until the local tail is published by
/// <see cref="SubmitAndWait"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public IoUringSqe* GetSqe()
{
    uint kernelHead = Volatile.Read(ref *_sqHead);
    uint inUse = _sqeTail - kernelHead; // unsigned subtraction tolerates wrap-around
    if (inUse >= _sqEntries)
    {
        return null;
    }

    // Take the current tail position, then advance the local tail.
    uint index = _sqeTail++ & _sqMask;
    _sqArray[index] = index; // identity mapping: array slot i refers to SQE i
    return _sqes + index;
}
0 ? IORING_ENTER_GETEVENTS : 0; + + return io_uring_enter(_fd, toSubmit, waitFor, flags); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetCqe(out IoUringCqe cqe) + { + uint head = *_cqHead; + uint tail = Volatile.Read(ref *_cqTail); + + if (head == tail) + { + cqe = default; + + return false; + } + + cqe = _cqes[head & _cqMask]; + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqeSeen() => Volatile.Write(ref *_cqHead, *_cqHead + 1); + + // Batched CQ drain (liburing io_uring_for_each_cqe + io_uring_cq_advance): + // read the kernel-written tail once (acquire), process the whole batch, + // then publish the consumed head once (release) instead of once per CQE. + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n); + + public void Dispose() + { + if (_ringPtr != null) + { + munmap(_ringPtr, _ringSize); _ringPtr = null; + } + + if (_sqePtr != null) + { + munmap(_sqePtr, _sqeSize); _sqePtr = null; + } + + if (_fd > 0) + { + close(_fd); _fd = 0; + } + } +} + +#pragma warning restore CA1806 diff --git a/Minima/Program.cs b/Minima/Program.cs index c4e38a0..241abaa 100644 --- a/Minima/Program.cs +++ b/Minima/Program.cs @@ -23,7 +23,7 @@ private static int Main() var config = new ServerConfig() { UsePipe = false, - ReactorCount = 24 + ReactorCount = 12 }; Console.WriteLine($"[Minima] starting {config.ReactorCount} reactors on port {config.Port} (incremental={config.Incremental})"); @@ -55,7 +55,7 @@ internal static class Handler // Real async-work knob: serialize an in-memory object of WORK_ITEMS elements to JSON // on the THREAD POOL (via Task.Run) per request. 
0 / unset = disabled (pure inline // reactor path). Genuine CPU + allocation, not a busy-spin. - private static readonly int WorkItems = 50; + private static readonly int WorkItems = 1; private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1)); @@ -91,16 +91,18 @@ public static async Task HandleAsync(Reactor reactor, Connection conn) conn.ReturnBuffer(in item); } } - + + _ = await Task.Run(static () => JsonSerializer.Serialize("Hello World!")); + // Real async work: serialize a large object to JSON on the THREAD POOL. // The handler resumes OFF-REACTOR, so the FlushAsync below pays the eventfd // handoff the pure-inline path avoids — and the serialization is genuine // CPU + GC pressure on the pool, not a busy-spin. - if (WorkItems > 0) + /*if (WorkItems > 0) { - //_ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject)); - JsonSerializer.SerializeToUtf8Bytes(LargeObject); - } + _ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject)); + //JsonSerializer.SerializeToUtf8Bytes(LargeObject); + }*/ // One response per recv burst — accumulate in the connection's // per-connection write slab, then submit and await ack. diff --git a/Minima/Reactor/Reactor.cs b/Minima/Reactor/Reactor.cs index 0346f62..7f51e32 100644 --- a/Minima/Reactor/Reactor.cs +++ b/Minima/Reactor/Reactor.cs @@ -170,7 +170,7 @@ public void EnqueueReturnQ(ushort bid) { sw.SpinOnce(); } - WakeFdWrite(); + //WakeFdWrite(); } internal void EnqueueFlush(int fd) diff --git a/MinimaSQPoll/Connection/Connection.Read.cs b/MinimaSQPoll/Connection/Connection.Read.cs new file mode 100644 index 0000000..444e1ae --- /dev/null +++ b/MinimaSQPoll/Connection/Connection.Read.cs @@ -0,0 +1,167 @@ +using System.Threading.Tasks.Sources; +using MinimaSQPoll.Utils; + +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace MinimaSQPoll; + +/// +/// Per-connection state. The handler may run on any thread (e.g. 
// File: MinimaSQPoll/Connection/Connection.Read.cs
using System.Threading.Tasks.Sources;
using MinimaSQPoll.Utils;

// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace MinimaSQPoll;

/// <summary>
/// Read side of the per-connection state. The handler may run on any thread
/// (e.g. resumed by a thread-pool timer) while the reactor publishes recv
/// completions via <see cref="Complete"/>. Coordination uses
/// Interlocked.Exchange on the arm flag and a sticky <c>_pending</c> flag to
/// close the lost-wakeup race.
///
/// Lifetime is pool-managed: the reactor pops a Connection on accept (or news
/// one if the pool is empty) and pushes it back on teardown after <c>Clear()</c>.
/// <c>_generation</c> is bumped on each <c>Clear()</c> so stale awaiter tokens
/// from a previous connection life are detectable and yield
/// <see cref="RecvSnapshot.Closed"/> instead of the new tenant's state.
/// </summary>
public sealed unsafe partial class Connection : IValueTaskSource<RecvSnapshot>
{
    /// <summary>Rebinds a pooled connection to a freshly accepted socket.</summary>
    internal Connection SetFd(int fd)
    {
        ClientFd = fd;
        return this;
    }

    private ManualResetValueTaskSourceCore<RecvSnapshot> _readSignal = new()
    {
        RunContinuationsAsynchronously = true,
    };

    private int _armed;    // 1 while a ReadAsync awaiter is parked on _readSignal
    private int _pending;  // sticky "something happened while nobody was armed"
    private int _closed;   // 1 once the connection has been marked closed

    private readonly SpscRecvRing _recv = new(capacityPow2: 16);

    // Ring tail first, then the closed flag — same read order everywhere.
    private RecvSnapshot SnapshotNow()
        => new(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0);

    /// <summary>
    /// Waits for recv data (or close). Completes synchronously when items are
    /// already queued or a wake-up is pending; otherwise arms the awaiter.
    /// </summary>
    /// <exception cref="InvalidOperationException">A read is already armed.</exception>
    public ValueTask<RecvSnapshot> ReadAsync()
    {
        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1)
        {
            Volatile.Write(ref _pending, 0);
            return new ValueTask<RecvSnapshot>(SnapshotNow());
        }

        if (Volatile.Read(ref _closed) != 0)
        {
            return new ValueTask<RecvSnapshot>(RecvSnapshot.Closed());
        }

        if (Interlocked.Exchange(ref _armed, 1) == 1)
        {
            throw new InvalidOperationException("ReadAsync already armed.");
        }

        // Snapshot the generation as the IVTS token so a future Clear() can
        // invalidate this awaiter if the connection gets pool-recycled.
        int gen = Volatile.Read(ref _generation);

        // Race recovery: re-check between arming and returning the IVTS task.
        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0)
        {
            Volatile.Write(ref _pending, 0);
            Interlocked.Exchange(ref _armed, 0);

            return new ValueTask<RecvSnapshot>(SnapshotNow());
        }

        return new ValueTask<RecvSnapshot>(this, (short)gen);
    }

    /// <summary>Dequeues the next item that was published up to <paramref name="snap"/>'s tail.</summary>
    public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item)
        => _recv.TryDequeueUntil(snap.Tail, out item);

    /// <summary>Resets the read signal so the next <see cref="ReadAsync"/> can be awaited.</summary>
    public void ResetRead() => _readSignal.Reset();

    /// <summary>
    /// Publishes one recv completion: enqueues the item and wakes an armed
    /// reader, or records a pending wake-up when nobody is armed.
    /// </summary>
    /// <param name="res">Byte count reported by the completion (stored as the item's <c>Len</c>).</param>
    /// <param name="bid">Provided-buffer id, meaningful only when <paramref name="hasBuffer"/> is true.</param>
    /// <param name="hasBuffer">Whether the completion carries a provided buffer.</param>
    /// <param name="ptr">Start of the buffer's bytes, or null when there is no buffer.</param>
    public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr)
    {
        var item = new SpscRecvRing.Item
        {
            Ptr = ptr,
            Bid = bid,
            Len = res,
            HasBuffer = hasBuffer,
            Gen = (ushort)Volatile.Read(ref _generation)
        };

        if (!_recv.TryEnqueue(item))
        {
            // Queue overflow: hand the buffer straight back and mark the
            // connection closed so the reader stops.
            Console.Error.WriteLine("[conn] recv queue overflow.");
            if (hasBuffer)
            {
                _reactor.ReturnBufferDirect(bid);
            }
            Volatile.Write(ref _closed, 1);
        }

        if (Interlocked.Exchange(ref _armed, 0) == 1)
        {
            _readSignal.SetResult(SnapshotNow());
        }
        else
        {
            Volatile.Write(ref _pending, 1);
        }
    }

    /// <summary>
    /// Returns any buffer ids still sitting in the SPSC ring (handler exited
    /// before draining them, or a recv arrived after <c>_closed</c> was set).
    /// </summary>
    internal void DrainRecv()
    {
        while (_recv.TryDequeue(out SpscRecvRing.Item item))
        {
            if (item.HasBuffer)
            {
                _reactor.ReturnBufferDirect(item.Bid);
            }
        }
    }

    // =========================================================================
    // IValueTaskSource plumbing — token (= snapshot of `_generation` at await
    // time) is compared against the current `_generation` to detect stale
    // awaiters from before a Clear()/pool reuse. Stale awaiters get a
    // sentinel result rather than the new tenant's state.
    //
    // For the actual IVTS dispatch we pass `_readSignal.Version` to the
    // underlying core (not `token`) because the core's version is bumped by
    // ResetRead mid-life and is unrelated to the cross-life generation guard.
    // =========================================================================

    private bool IsStaleReadToken(short token)
        => token != (short)Volatile.Read(ref _generation);

    RecvSnapshot IValueTaskSource<RecvSnapshot>.GetResult(short token)
    {
        if (IsStaleReadToken(token))
        {
            return RecvSnapshot.Closed();
        }

        return _readSignal.GetResult(_readSignal.Version);
    }

    ValueTaskSourceStatus IValueTaskSource<RecvSnapshot>.GetStatus(short token)
    {
        if (IsStaleReadToken(token))
        {
            return ValueTaskSourceStatus.Succeeded;
        }

        return _readSignal.GetStatus(_readSignal.Version);
    }

    void IValueTaskSource<RecvSnapshot>.OnCompleted(Action<object?> continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
    {
        if (IsStaleReadToken(token))
        {
            // Stale — run the continuation now so the awaiter unblocks and
            // gets RecvSnapshot.Closed() from GetResult.
            continuation(state);

            return;
        }

        _readSignal.OnCompleted(continuation, state, _readSignal.Version, flags);
    }
}
// File: MinimaSQPoll/Connection/Connection.Write.cs
using System.Buffers;
using System.Threading.Tasks.Sources;
using MinimaSQPoll.Utils;

// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace MinimaSQPoll;

/// <summary>
/// Write side of the connection: a fixed-size native slab exposed as an
/// <see cref="IBufferWriter{T}"/>, plus <see cref="FlushAsync"/>, which hands
/// the accumulated bytes to the reactor and completes when the send finishes.
/// Writing is rejected while a flush is in progress.
/// </summary>
public sealed unsafe partial class Connection : IValueTaskSource, IBufferWriter<byte>
{
    private readonly int _writeSlabSize;
    internal byte* WriteBuffer;
    internal int WriteHead;      // bytes of the in-flight flush already sent
    internal int WriteTail;      // bytes written into the slab so far
    internal int WriteInFlight;  // byte count captured by the current flush

    // NOTE(review): type arguments were stripped from this listing; if
    // UnmanagedMemoryManager is generic in the project, restore its argument.
    private readonly UnmanagedMemoryManager _manager;

    private ManualResetValueTaskSourceCore<bool> _flushSignal = new()
    {
        RunContinuationsAsynchronously = true,
    };
    private int _flushArmed;
    private int _flushInProgress;

    private void ThrowIfFlushInProgress()
    {
        if (Volatile.Read(ref _flushInProgress) != 0)
        {
            throw new InvalidOperationException("Cannot write while flush is in progress.");
        }
    }

#region IBufferWriter

    /// <summary>Returns the unused remainder of the slab as <see cref="Memory{T}"/>.</summary>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or fewer than <paramref name="sizeHint"/> bytes
    /// (at least one) remain.
    /// </exception>
    public Memory<byte> GetMemory(int sizeHint = 0)
    {
        ThrowIfFlushInProgress();

        int remaining = _writeSlabSize - WriteTail;
        // IBufferWriter must never hand out an empty buffer, so a hint of 0
        // still requires one free byte.
        if (Math.Max(sizeHint, 1) > remaining)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        return _manager.Memory.Slice(WriteTail, remaining);
    }

    /// <summary>Returns the unused remainder of the slab as a <see cref="Span{T}"/>.</summary>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or fewer than <paramref name="sizeHint"/> bytes
    /// (at least one) remain.
    /// </exception>
    public Span<byte> GetSpan(int sizeHint = 0)
    {
        ThrowIfFlushInProgress();

        // Compare against the remainder instead of adding to WriteTail so a
        // huge hint cannot wrap around and slip past the check.
        int remaining = _writeSlabSize - WriteTail;
        if (Math.Max(sizeHint, 1) > remaining)
        {
            throw new InvalidOperationException("Write buffer too small.");
        }

        return new Span<byte>(WriteBuffer + WriteTail, remaining);
    }

    /// <summary>Commits <paramref name="count"/> bytes previously written via GetSpan/GetMemory.</summary>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or the advance would run past the end of the slab.
    /// </exception>
    public void Advance(int count)
    {
        ThrowIfFlushInProgress();
        ArgumentOutOfRangeException.ThrowIfNegative(count);

        if (count > _writeSlabSize - WriteTail)
        {
            throw new InvalidOperationException("Cannot advance past the end of the write buffer.");
        }

        WriteTail += count;
    }

#endregion

    /// <summary>Copies <paramref name="source"/> to the end of the slab.</summary>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or <paramref name="source"/> does not fit.
    /// </exception>
    public void Write(ReadOnlySpan<byte> source)
    {
        ThrowIfFlushInProgress();

        int len = source.Length;
        // Overflow-safe form of "WriteTail + len > _writeSlabSize".
        if (len > _writeSlabSize - WriteTail)
        {
            throw new InvalidOperationException("Write buffer too small.");
        }

        source.CopyTo(new Span<byte>(WriteBuffer + WriteTail, len));
        WriteTail += len;
    }

    /// <summary>
    /// Hands the bytes written so far to the reactor for sending. The returned
    /// task completes when the reactor calls <see cref="CompleteFlush"/> or the
    /// connection is closed.
    /// </summary>
    /// <exception cref="InvalidOperationException">A flush is already in progress.</exception>
    public ValueTask FlushAsync()
    {
        // Connection already torn down (reactor saw EOF/error → MarkClosed): don't flush
        // a removed connection — the handoff would reach a reactor that no longer knows
        // this fd and the awaiter would hang. Return completed so the handler unwinds to
        // its next ReadAsync, sees IsClosed, and exits.
        if (Volatile.Read(ref _closed) == 1)
        {
            return default;
        }

        if (Interlocked.Exchange(ref _flushInProgress, 1) == 1)
        {
            throw new InvalidOperationException("FlushAsync already in progress.");
        }

        int target = WriteTail;
        if (target == 0)
        {
            // Nothing buffered — release the guard and complete synchronously.
            Volatile.Write(ref _flushInProgress, 0);

            return default;
        }

        if (Interlocked.Exchange(ref _flushArmed, 1) == 1)
        {
            throw new InvalidOperationException("FlushAsync already armed.");
        }

        _flushSignal.Reset();
        WriteInFlight = target;

        int gen = Volatile.Read(ref _generation);

        _reactor.EnqueueFlush(this);

        // Race recovery (mirrors ReadAsync): if close raced in after the guard above,
        // self-complete so we don't hang waiting on a send the reactor will never make.
        if (Volatile.Read(ref _closed) == 1 && Interlocked.Exchange(ref _flushArmed, 0) == 1)
        {
            Volatile.Write(ref _flushInProgress, 0);
            _flushSignal.SetResult(true);
        }

        return new ValueTask(this, (short)gen);
    }

    /// <summary>
    /// Signals that the flush completed; called by the reactor's send branch
    /// once every in-flight byte has been sent. Empties the slab.
    /// </summary>
    internal void CompleteFlush()
    {
        WriteHead = 0;
        WriteTail = 0;
        WriteInFlight = 0;
        Volatile.Write(ref _flushInProgress, 0);
        Interlocked.Exchange(ref _flushArmed, 0);

        _flushSignal.SetResult(true);
    }

#region IValueTaskSource

    // Same stale-token guard as the read side: a token from a previous
    // connection life reports success and runs its continuation immediately.

    void IValueTaskSource.GetResult(short token)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            return;
        }

        _flushSignal.GetResult(_flushSignal.Version);
    }

    ValueTaskSourceStatus IValueTaskSource.GetStatus(short token)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            return ValueTaskSourceStatus.Succeeded;
        }

        return _flushSignal.GetStatus(_flushSignal.Version);
    }

    void IValueTaskSource.OnCompleted(Action<object?> continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            continuation(state);

            return;
        }

        _flushSignal.OnCompleted(continuation, state, _flushSignal.Version, flags);
    }

#endregion
}
// File: MinimaSQPoll/Connection/Connection.cs
using System.Runtime.InteropServices;
using MinimaSQPoll.Utils;

namespace MinimaSQPoll;

/// <summary>
/// Core connection state: owning reactor, socket fd, pool generation, the
/// two-owner refcount, and the native write slab's lifetime.
/// </summary>
public sealed unsafe partial class Connection
{
    private readonly Reactor _reactor;

    public int ClientFd { get; private set; }

    // Bumped on Clear(); the low 16 bits are used as the IVTS token so stale
    // awaiters can be detected after pool reuse.
    private int _generation;

    // Refcount: the connection has two owners — the reactor (recv side) and the
    // handler (which may run off-reactor). Init to 2 on accept; each owner DecRef's
    // when done; teardown (Recycle) runs only at refs==0, so a connection is never
    // recycled or pool-reused while a handler is still in flight on another thread.
    private int _refs;

    /// <summary>Creates a connection and allocates its 64-byte-aligned native write slab.</summary>
    /// <param name="reactor">Reactor that owns this connection.</param>
    /// <param name="fd">Accepted client socket.</param>
    /// <param name="writeSlabSize">Size of the write slab in bytes; must be positive.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reactor"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="writeSlabSize"/> is zero or negative.</exception>
    public Connection(Reactor reactor, int fd, int writeSlabSize = 1024 * 16)
    {
        ArgumentNullException.ThrowIfNull(reactor);
        // A non-positive size would turn into an enormous nuint below.
        ArgumentOutOfRangeException.ThrowIfNegativeOrZero(writeSlabSize);

        _reactor = reactor;
        ClientFd = fd;
        _writeSlabSize = writeSlabSize;
        WriteBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)writeSlabSize, 64);

        _manager = new UnmanagedMemoryManager(WriteBuffer, writeSlabSize);
    }

    // =========================================================================
    // Pool lifecycle. MarkClosed is invoked from Reactor.Dispatch's recv/send
    // error paths; the rest of teardown runs in Reactor.Recycle, which
    // DecRef() triggers on whichever thread drops the last reference.
    //
    // teardown:  MarkClosed() → wake awaiters with closed=1
    //            DrainRecv()  → return any in-flight buf_ring items
    //            close(fd)
    //            Clear()      → reset state, bump _generation
    //            push to pool, OR Dispose() if pool is full
    // =========================================================================

    /// <summary>Marks the connection closed and wakes any parked reader and flusher.</summary>
    public void MarkClosed()
    {
        Volatile.Write(ref _closed, 1);

        if (Interlocked.Exchange(ref _armed, 0) == 1)
        {
            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true));
        }
        else
        {
            // Nobody armed: leave a sticky wake-up for the next ReadAsync.
            Volatile.Write(ref _pending, 1);
        }

        if (Interlocked.Exchange(ref _flushArmed, 0) == 1)
        {
            Volatile.Write(ref _flushInProgress, 0);
            _flushSignal.SetResult(true);
        }
    }

    // Init to 2 (reactor + handler) at accept.
    internal void InitRefs() => Volatile.Write(ref _refs, 2);

    // Release one owner's ref. Whoever drives it to 0 hands the connection to the
    // reactor for teardown (close + Clear + pool) — never recycled before both done.
    internal void DecRef()
    {
        if (Interlocked.Decrement(ref _refs) == 0)
        {
            _reactor.EnqueueRecycle(this);
        }
    }

    /// <summary>Resets all per-life state so the instance can be reused from the pool.</summary>
    internal void Clear()
    {
        // Bump generation first — readers of IVTS plumbing observe this via
        // Volatile.Read and stale tokens get RecvSnapshot.Closed() / no-op.
        Interlocked.Increment(ref _generation);

        Volatile.Write(ref _armed, 0);
        Volatile.Write(ref _pending, 0);
        Volatile.Write(ref _closed, 0);
        Volatile.Write(ref _flushArmed, 0);
        Volatile.Write(ref _flushInProgress, 0);

        WriteHead = 0;
        WriteTail = 0;
        WriteInFlight = 0;

        _readSignal.Reset();
        _flushSignal.Reset();

        _recv.Reset(); // discard any leftover SPSC items
    }

    /// <summary>Frees the native write slab. Safe to call more than once.</summary>
    public void Dispose()
    {
        if (WriteBuffer != null)
        {
            NativeMemory.AlignedFree(WriteBuffer);
            WriteBuffer = null;
        }
    }

    // Convenience: hand a buffer back to the reactor's shared buf_ring. Lives
    // here (instead of the deleted Connection.Incremental.cs) because the raw
    // handler API in Program.cs and ConnectionPipeReader both call it.
    public void ReturnBuffer(in SpscRecvRing.Item item) => _reactor.EnqueueReturnQ(item.Bid);
}
// File: MinimaSQPoll/Connection/ConnectionDualPipe.cs
using System.IO.Pipelines;

namespace MinimaSQPoll;

/// <summary>
/// Bundles a <see cref="ConnectionPipeReader"/> and a
/// <see cref="ConnectionPipeWriter"/> over one <see cref="Connection"/> as an
/// <see cref="IDuplexPipe"/>.
/// </summary>
public sealed class ConnectionDualPipe : IDuplexPipe
{
    /// <summary>Creates both pipe adapters over <paramref name="connection"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="connection"/> is null.</exception>
    public ConnectionDualPipe(Connection connection)
    {
        ArgumentNullException.ThrowIfNull(connection);

        // Reader is constructed before writer, as in the original.
        var input = new ConnectionPipeReader(connection);
        var output = new ConnectionPipeWriter(connection);

        Input = input;
        Output = output;
    }

    /// <summary>Read half of the connection.</summary>
    public PipeReader Input { get; }

    /// <summary>Write half of the connection.</summary>
    public PipeWriter Output { get; }
}
// File: MinimaSQPoll/Connection/ConnectionPipeReader.cs
using System.Buffers;
using System.IO.Pipelines;
using MinimaSQPoll.Utils;
// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace MinimaSQPoll;

/// <summary>
/// Adapts Minima's raw read API (<c>ReadAsync</c> + <c>TryGetItem</c>
/// + <c>ReturnBuffer</c>) to a standard <see cref="PipeReader"/>. Recv buffers are
/// exposed zero-copy as a ReadOnlySequence&lt;byte&gt; (one segment per buffer)
/// and held until AdvanceTo consumes them, at which point fully-consumed buffers
/// are returned to the reactor.
///
/// Convenience/compat layer for PipeReader consumers — the raw ReadAsync/
/// TryGetItem path stays the faster one (this adds held-buffer + sequence
/// bookkeeping per read).
/// </summary>
public sealed class ConnectionPipeReader : PipeReader
{
    private readonly Connection _conn;
    private readonly List<Held> _held = new(16);
    private ReadOnlySequence<byte> _lastSequence;

    private bool _completed;
    private bool _cancelRequested;
    private bool _connectionClosed;

    // True when the last AdvanceTo examined every held byte: the next read
    // must then wait for new data instead of handing back the same bytes.
    private bool _examinedAll;

    // One recv buffer still owned by the reader, plus its unconsumed window.
    private readonly struct Held
    {
        public readonly ReadOnlyMemory<byte> Memory;
        public readonly SpscRecvRing.Item Item;

        public Held(ReadOnlyMemory<byte> memory, SpscRecvRing.Item item)
        {
            Memory = memory;
            Item = item;
        }

        public Held WithMemory(ReadOnlyMemory<byte> memory) => new(memory, Item);
    }

    public ConnectionPipeReader(Connection connection)
    {
        _conn = connection ?? throw new ArgumentNullException(nameof(connection));
    }

    /// <summary>
    /// Returns held-but-unexamined data immediately; otherwise awaits the next
    /// recv burst (or close) from the connection and appends its buffers.
    /// </summary>
    /// <remarks>
    /// The token is only checked on entry, and <see cref="CancelPendingRead"/>
    /// only sets a flag: neither interrupts an await already in progress.
    /// </remarks>
    /// <exception cref="InvalidOperationException">The reader was completed.</exception>
    /// <exception cref="OperationCanceledException">The token was already canceled on entry.</exception>
    public override async ValueTask<ReadResult> ReadAsync(CancellationToken cancellationToken = default)
    {
        ThrowIfCompleted();
        cancellationToken.ThrowIfCancellationRequested();

        if (_cancelRequested)
        {
            _cancelRequested = false;
            return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed);
        }

        // Anything still held from a previous read that wasn't fully examined.
        if (_held.Count > 0 && !_examinedAll)
        {
            return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
        }

        // No more data will ever arrive: report completion with whatever is left.
        if (_connectionClosed)
        {
            return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: true);
        }

        RecvSnapshot snap = await _conn.ReadAsync();

        while (_conn.TryGetItem(snap, out SpscRecvRing.Item item))
        {
            if (item.HasBuffer)
            {
                _held.Add(new Held(item.AsMemoryManager().Memory, item));
            }
        }

        _conn.ResetRead();
        _examinedAll = false;

        if (snap.IsClosed)
        {
            _connectionClosed = true;
        }

        bool canceled = _cancelRequested;
        _cancelRequested = false;

        return new ReadResult(BuildSequence(), isCanceled: canceled, isCompleted: _connectionClosed);
    }

    /// <summary>Synchronous variant: succeeds only when unexamined data is held or the connection is closed.</summary>
    public override bool TryRead(out ReadResult result)
    {
        ThrowIfCompleted();

        if (_held.Count > 0 && !_examinedAll)
        {
            result = new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
            return true;
        }

        if (_connectionClosed)
        {
            result = new ReadResult(BuildSequence(), isCanceled: false, isCompleted: true);
            return true;
        }

        result = default;
        return false;
    }

    public override void AdvanceTo(SequencePosition consumed) => AdvanceTo(consumed, consumed);

    /// <summary>
    /// Releases fully consumed buffers back to the reactor, trims a partially
    /// consumed one, and records whether everything held was examined.
    /// </summary>
    public override void AdvanceTo(SequencePosition consumed, SequencePosition examined)
    {
        if (_held.Count == 0)
        {
            return;
        }

        long consumedBytes = _lastSequence.Slice(0, consumed).Length;
        long examinedBytes = _lastSequence.Slice(0, examined).Length;
        _examinedAll = examinedBytes >= _lastSequence.Length;

        // Count whole-buffer releases first and remove them in one shot
        // (repeated RemoveAt(0) shifts the list every time).
        int released = 0;
        while (released < _held.Count && consumedBytes > 0)
        {
            Held seg = _held[released];
            int available = seg.Memory.Length;

            if (consumedBytes >= available)
            {
                // Whole buffer consumed — return it to the reactor.
                _conn.ReturnBuffer(seg.Item);
                released++;
                consumedBytes -= available;
            }
            else
            {
                // Partial — keep the unconsumed tail of this buffer.
                _held[released] = seg.WithMemory(seg.Memory[(int)consumedBytes..]);
                consumedBytes = 0;
            }
        }

        if (released > 0)
        {
            _held.RemoveRange(0, released);
        }
    }

    public override void CancelPendingRead() => _cancelRequested = true;

    /// <summary>Completes the reader and returns every still-held buffer to the reactor.</summary>
    public override void Complete(Exception? exception = null)
    {
        if (_completed)
        {
            return;
        }

        _completed = true;

        for (int i = 0; i < _held.Count; i++)
        {
            _conn.ReturnBuffer(_held[i].Item);
        }

        _held.Clear();
    }

    // Builds (and remembers) the sequence over the held buffers: empty, a
    // single memory, or a RingSegment chain with one segment per buffer.
    private ReadOnlySequence<byte> BuildSequence()
    {
        if (_held.Count == 0)
        {
            _lastSequence = default;
            return _lastSequence;
        }

        if (_held.Count == 1)
        {
            _lastSequence = new ReadOnlySequence<byte>(_held[0].Memory);
            return _lastSequence;
        }

        var head = new RingSegment(_held[0].Memory, _held[0].Item.Bid);
        RingSegment tail = head;

        for (int i = 1; i < _held.Count; i++)
        {
            tail = tail.Append(_held[i].Memory, _held[i].Item.Bid);
        }

        _lastSequence = new ReadOnlySequence<byte>(head, 0, tail, tail.Memory.Length);
        return _lastSequence;
    }

    private void ThrowIfCompleted()
    {
        if (_completed)
        {
            throw new InvalidOperationException("Reading is not allowed after the reader was completed.");
        }
    }
}
// File: MinimaSQPoll/Connection/ConnectionPipeWriter.cs
using System.IO.Pipelines;
// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace MinimaSQPoll;

/// <summary>
/// Adapts Minima's write API (<c>GetMemory</c>/<c>GetSpan</c>/<c>Advance</c>/
/// <c>FlushAsync</c>) to a standard <see cref="PipeWriter"/>, so PipeWriter-based code
/// can write responses through the connection's per-connection slab.
/// A thin wrapper — all the work lives in Connection.
/// </summary>
public sealed class ConnectionPipeWriter : PipeWriter
{
    private readonly Connection _conn;
    private bool _completed;
    private bool _cancelRequested;
    private long _unflushed;

    public ConnectionPipeWriter(Connection connection)
    {
        _conn = connection ?? throw new ArgumentNullException(nameof(connection));
    }

    public override bool CanGetUnflushedBytes => true;
    public override long UnflushedBytes => _unflushed;

    public override Memory<byte> GetMemory(int sizeHint = 0) => _conn.GetMemory(sizeHint);

    public override Span<byte> GetSpan(int sizeHint = 0) => _conn.GetSpan(sizeHint);

    /// <summary>Commits <paramref name="bytes"/> written bytes to the connection's slab.</summary>
    public override void Advance(int bytes)
    {
        // Advance the connection first: if it throws, the unflushed counter
        // must not include bytes that were never committed.
        _conn.Advance(bytes);
        _unflushed += bytes;
    }

    /// <summary>
    /// Flushes the connection's slab. Returns a canceled result when
    /// <see cref="CancelPendingFlush"/> was called since the last flush, and a
    /// canceled task when the token is already canceled on entry.
    /// </summary>
    public override ValueTask<FlushResult> FlushAsync(CancellationToken cancellationToken = default)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return ValueTask.FromCanceled<FlushResult>(cancellationToken);
        }

        if (_cancelRequested)
        {
            _cancelRequested = false;
            return new ValueTask<FlushResult>(new FlushResult(isCanceled: true, isCompleted: _completed));
        }

        _unflushed = 0;
        ValueTask inner = _conn.FlushAsync();

        // Fast path: the connection completed synchronously (nothing to send,
        // or already closed), so no async state machine is needed.
        if (inner.IsCompletedSuccessfully)
        {
            return new ValueTask<FlushResult>(new FlushResult(isCanceled: false, isCompleted: _completed));
        }

        return AwaitFlush(inner);
    }

    private async ValueTask<FlushResult> AwaitFlush(ValueTask inner)
    {
        await inner;
        return new FlushResult(isCanceled: false, isCompleted: _completed);
    }

    public override void CancelPendingFlush() => _cancelRequested = true;

    public override void Complete(Exception? exception = null) => _completed = true;
}
// File: MinimaSQPoll/Connection/RecvSnapshot.cs
namespace MinimaSQPoll;

/// <summary>
/// Immutable result of a connection read: the recv-ring tail captured at
/// wake-up time plus whether the connection was closed at that moment.
/// </summary>
public readonly struct RecvSnapshot
{
    /// <summary>Recv-ring tail position captured for this snapshot.</summary>
    public readonly long Tail;

    /// <summary>True when the connection was observed closed.</summary>
    public readonly bool IsClosed;

    public RecvSnapshot(long tail, bool isClosed)
        => (Tail, IsClosed) = (tail, isClosed);

    /// <summary>Sentinel snapshot: tail 0, closed.</summary>
    public static RecvSnapshot Closed()
    {
        return new RecvSnapshot(tail: 0, isClosed: true);
    }
}
// File: MinimaSQPoll/Program.cs
using System.Buffers;
using System.IO.Pipelines;
using System.Text.Json;
using MinimaSQPoll.Utils;

namespace MinimaSQPoll;

/// <summary>
/// Multi-reactor HTTP/1.1 server using io_uring directly. Spawns
/// <c>ServerConfig.ReactorCount</c> reactor threads; each opens its own
/// SO_REUSEPORT listener, runs its own io_uring and owns its own connection
/// map. The kernel load-balances new connections across reactors. Handlers
/// may resume off the reactor thread, so per-connection state is coordinated
/// with interlocked flags inside <see cref="Connection"/>.
/// </summary>
internal static unsafe class Program
{
    internal static ReadOnlySpan<byte> Response =>
        "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8;

    private static int Main()
    {
        // All tunables live in ServerConfig — override the defaults here.
        var config = new ServerConfig()
        {
            UsePipe = false,
            ReactorCount = 6
        };

        Console.WriteLine($"[Minima] starting {config.ReactorCount} reactors on port {config.Port} (incremental={config.Incremental})");

        var threads = new Thread[config.ReactorCount];
        for (var i = 0; i < config.ReactorCount; i++)
        {
            var reactor = new Reactor(i, config);

            threads[i] = new Thread(reactor.Run)
            {
                Name = $"reactor-{i}",
                IsBackground = false
            };
            threads[i].Start();
        }

        // Reactors run until their loop fails; block until all have exited.
        foreach (var t in threads)
        {
            t.Join();
        }

        return 0;
    }
}

internal static class Handler
{
    // Async-work knob for the (currently disabled) experiment described in
    // HandleAsync: number of elements in the object that would be serialized
    // to JSON per request. 0 = disabled.
    private static readonly int WorkItems = 50;

    private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1));

    private static Payload BuildPayload(int count)
    {
        var items = new Item[count];
        for (int i = 0; i < count; i++)
        {
            items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}");
        }
        return new Payload(DateTime.UtcNow.ToString("O"), count, items);
    }

    /// <summary>
    /// Raw handler loop: await a recv burst, return its buffers, and answer
    /// with one fixed response per burst that actually carried data.
    /// </summary>
    public static async Task HandleAsync(Reactor reactor, Connection conn)
    {
        try
        {
            while (true)
            {
                RecvSnapshot snap = await conn.ReadAsync();

                // A wake-up can carry no data at all (close, or a sticky
                // pending flag); only such bursts with bytes get a response,
                // matching HandlePipeAsync.
                bool received = false;

                while (conn.TryGetItem(snap, out SpscRecvRing.Item item))
                {
                    if (!item.HasBuffer)
                    {
                        continue;
                    }

                    UnmanagedMemoryManager mem = item.AsMemoryManager();
                    ReadOnlyMemory<byte> data = mem.Memory;
                    // data is now usable with any BCL Memory/async API
                    _ = data.Length;
                    received = true;

                    // Hand the recv buffer back to the reactor's buffer ring.
                    conn.ReturnBuffer(in item);
                }

                // Disabled experiment: to measure the off-reactor resume path,
                // do real CPU + allocation work on the thread pool here, e.g.
                //   if (WorkItems > 0)
                //       _ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject));

                if (received)
                {
                    // One response per recv burst — accumulate in the connection's
                    // per-connection write slab, then submit and await ack.
                    conn.Write(Program.Response);
                    await conn.FlushAsync();
                }

                if (snap.IsClosed)
                {
                    // Reactor already owns teardown (Connections removal happens
                    // in Dispatch's recv-error branch); we just exit.
                    return;
                }

                conn.ResetRead();
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"[r{reactor.Id}] handler crash on fd={conn.ClientFd}: {ex}");
            // Reactor will clean the connection up via the recv-error path
            // (or SPSC overflow) on the next CQE for this fd.
        }
        finally
        {
            conn.DecRef(); // release the handler's ref; teardown runs once the reactor releases too
        }
    }

    /// <summary>
    /// PipeReader/PipeWriter variant — same behavior, driven through the BCL
    /// pipe adapters instead of the raw ReadAsync/TryGetItem/Write API.
    /// </summary>
    public static async Task HandlePipeAsync(Reactor reactor, Connection conn)
    {
        var reader = new ConnectionPipeReader(conn);
        var writer = new ConnectionPipeWriter(conn);

        try
        {
            while (true)
            {
                ReadResult read = await reader.ReadAsync();
                ReadOnlySequence<byte> buffer = read.Buffer;

                if (!buffer.IsEmpty)
                {
                    // A real server would parse requests out of `buffer` here.
                    writer.Write(Program.Response);
                    await writer.FlushAsync();
                }

                // Consume everything we got; AdvanceTo returns the recv buffers.
                reader.AdvanceTo(buffer.End);

                if (read.IsCompleted)
                {
                    break;
                }
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"[r{reactor.Id}] pipe handler crash on fd={conn.ClientFd}: {ex}");
        }
        finally
        {
            reader.Complete();
            writer.Complete();
            conn.DecRef();
        }
    }
}

internal sealed record Item(int Id, string Name, double Value, bool Active, string Category);
internal sealed record Payload(string Generated, int Count, Item[] Items);
+ private const ushort BgId = 1; + private readonly uint BufferRingEntries; // power of two + private byte* _bufRing; // io_uring_buf_ring (kernel-shared) + private byte* _bufSlab; // contiguous slab of recv buffers + private uint _bufRingMask; + private ushort _bufRingTail; + + // Guards multi-threaded updates to the buf_ring tail. Critical section is a + // 16-byte write + a ushort tail bump, so SpinLock is the right primitive. + private SpinLock _bufRingLock = new SpinLock(false); + + // Connection pool: accept runs on the reactor, recycle can run on any + // thread (handler refcount → 0 off-reactor), so use the MPMC variant. + private readonly int PoolMax; + private readonly ConcurrentStack _pool = new(); + + // Incremental-mode (IOU_PBUF_RING_INC) sizing. + private readonly int MaxConnections; + private readonly int ConnBufRingEntries; + private readonly uint IncRecvBufferSize; + + // Transient io_uring_enter errnos (Linux): interrupted, would-block, busy. + private const int EINTR = 4; + private const int EAGAIN = 11; + private const int EBUSY = 16; + + public Reactor(int id, ServerConfig config) + { + Id = id; + _config = config; + _port = config.Port; + _ringEntries = config.RingEntries; + _incremental = config.Incremental; + RecvBufferSize = (uint)config.RecvBufferSize; + BufferRingEntries = (uint)config.BufferRingEntries; + PoolMax = config.PoolMax; + MaxConnections = config.MaxConnections; + ConnBufRingEntries = config.ConnBufRingEntries; + IncRecvBufferSize = (uint)config.IncRecvBufferSize; + } + + // ========================================================================= + // Buffer ring + // ========================================================================= + + private void InitBufferRing() + { + nuint ringBytes = (nuint)BufferRingEntries * 16; + _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096); + NativeMemory.Clear(_bufRing, ringBytes); + + nuint slabBytes = BufferRingEntries * (nuint)RecvBufferSize; + _bufSlab = 
(byte*)NativeMemory.AlignedAlloc(slabBytes, 64); + + _bufRingMask = BufferRingEntries - 1; + + var reg = new io_uring_buf_reg { + ring_addr = (ulong)_bufRing, + ring_entries = BufferRingEntries, + bgid = BgId, + }; + + int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, ®, 1); + if (ret < 0) + { + int err = Marshal.GetLastPInvokeError(); + throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}"); + } + + for (ushort bid = 0; bid < BufferRingEntries; bid++) { + byte* slot = _bufRing + (uint)bid * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize); + *(uint*)(slot + 8) = RecvBufferSize; + *(ushort*)(slot + 12) = bid; + } + _bufRingTail = (ushort)BufferRingEntries; + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + + // Thread-safe buf_ring return — callable from any handler thread. + internal void ReturnBufferDirect(ushort bid) + { + bool taken = false; + _bufRingLock.Enter(ref taken); + try + { + byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize); + *(uint*)(slot + 8) = RecvBufferSize; + *(ushort*)(slot + 12) = bid; + _bufRingTail++; + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + finally { _bufRingLock.Exit(); } + } + + // ========================================================================= + // Cross-thread entry points — all run directly on the calling thread now, + // no MPSC handoff, no eventfd wake. Synchronisation is via the SpinLocks + // inside Ring (for SQ submit) and on _bufRingLock (for buf_ring return). 
+ // ========================================================================= + + public void EnqueueReturnQ(ushort bid) => ReturnBufferDirect(bid); + + internal void EnqueueFlush(Connection conn) + { + SubmitSend(conn.ClientFd, conn.WriteBuffer, (uint)conn.WriteInFlight); + } + + internal void EnqueueRecycle(Connection conn) => Recycle(conn, conn.ClientFd); + + // ========================================================================= + // Main loop + // ========================================================================= + + public void Run() + { + Ring = Ring.Create(_ringEntries); + _listenFd = OpenReusePortListener(_port); + + InitBufferRing(); + + Console.WriteLine($"[r{Id}] listening on 0.0.0.0:{_port}"); + SubmitAcceptMultishot(); + + LoopShared(); + + close(_listenFd); + Ring.Dispose(); + } + + private void LoopShared() + { + while (true) + { + int rc = Ring.WaitForCqe(1); + if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) + { + Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}"); + break; + } + + uint ready = Ring.CqReady(); + for (uint i = 0; i < ready; i++) + { + Dispatch(in Ring.CqeAt(i)); + } + Ring.CqAdvance(ready); + } + } + + private void Dispatch(in IoUringCqe cqe) + { + ulong kind = cqe.user_data & 0xffffffff_00000000UL; + int fd = (int)(cqe.user_data & 0xffffffffUL); + bool more = (cqe.flags & IORING_CQE_F_MORE) != 0; + + if (kind == KindAccept) + { + if (cqe.res >= 0) + { + int clientFd = cqe.res; + SetNoDelay(clientFd); + Connection conn = _pool.TryPop(out var pooled) + ? pooled.SetFd(clientFd) + : new Connection(this, clientFd, _config.WriteSlabSize); + Connections[clientFd] = conn; + conn.InitRefs(); + SubmitRecvMultishot(clientFd); + + _ = _config.UsePipe + ? 
Handler.HandlePipeAsync(this, conn) + : Handler.HandleAsync(this, conn); + } + else + { + Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}"); + } + if (!more) + { + SubmitAcceptMultishot(); + } + } + else if (kind == KindRecv) + { + bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0; + ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0; + + if (cqe.res <= 0) + { + if (hasBuf) + { + ReturnBufferDirect(bid); + } + if (Connections.TryRemove(fd, out var dyingConn)) + { + dyingConn.MarkClosed(); + dyingConn.DecRef(); + } + return; + } + + if (!Connections.TryGetValue(fd, out var conn)) + { + if (hasBuf) + { + ReturnBufferDirect(bid); + } + return; + } + + byte* ptr = hasBuf ? _bufSlab + (nuint)bid * (nuint)RecvBufferSize : null; + conn.Complete(cqe.res, bid, hasBuf, ptr); + + if (!more) + { + SubmitRecvMultishot(fd); + } + } + else if (kind == KindSend) + { + if (!Connections.TryGetValue(fd, out var conn)) + { + return; + } + if (cqe.res <= 0) + { + if (Connections.TryRemove(fd, out _)) + { + conn.MarkClosed(); + conn.DecRef(); + } + return; + } + conn.WriteHead += cqe.res; + if (conn.WriteHead < conn.WriteInFlight) + { + SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead)); + return; + } + conn.CompleteFlush(); + } + } + + // ========================================================================= + // SQE producers — thread-safe via Ring's submit SpinLock. Each call is + // one allocate-write-publish cycle, so any thread can submit directly. 
+    // =========================================================================
+
+    /// <summary>Claims the next free SQE from the ring.</summary>
+    /// <exception cref="InvalidOperationException">The submission queue has no free entry.</exception>
+    private IoUringSqe* GetSqe()
+    {
+        IoUringSqe* sqe = Ring.TryGetSqe();
+        if (sqe == null)
+        {
+            throw new InvalidOperationException("SQ full");
+        }
+        return sqe;
+    }
+
+    /// <summary>Arms a multishot accept on the listen socket.</summary>
+    private void SubmitAcceptMultishot()
+    {
+        IoUringSqe* sqe = GetSqe();
+        // Zero all 64 bytes of the SQE so fields left unset are 0.
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_ACCEPT;
+        sqe->ioprio = IORING_ACCEPT_MULTISHOT;
+        sqe->fd = _listenFd;
+        sqe->user_data = KindAccept | (uint)_listenFd;
+        Ring.PublishSqe();
+    }
+
+    /// <summary>Arms a multishot recv on <paramref name="fd"/> using the reactor's buffer group <c>BgId</c>.</summary>
+    private void SubmitRecvMultishot(int fd) => SubmitRecvMultishot(fd, BgId);
+
+    /// <summary>
+    /// Arms a multishot recv on <paramref name="fd"/> that takes its buffers
+    /// from provided-buffer group <paramref name="bgid"/>.
+    /// </summary>
+    private void SubmitRecvMultishot(int fd, ushort bgid)
+    {
+        IoUringSqe* sqe = GetSqe();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_RECV;
+        sqe->flags = IOSQE_BUFFER_SELECT;
+        sqe->ioprio = IORING_RECV_MULTISHOT;
+        sqe->fd = fd;
+        // With IOSQE_BUFFER_SELECT this field is set to the buffer group id.
+        sqe->buf_index = bgid;
+        sqe->user_data = KindRecv | (uint)fd;
+        Ring.PublishSqe();
+    }
+
+    /// <summary>Submits a single send of <paramref name="len"/> bytes starting at <paramref name="buf"/>.</summary>
+    private void SubmitSend(int fd, byte* buf, uint len)
+    {
+        IoUringSqe* sqe = GetSqe();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_SEND;
+        sqe->fd = fd;
+        sqe->addr = (ulong)buf;
+        sqe->len = len;
+        sqe->user_data = KindSend | (uint)fd;
+        Ring.PublishSqe();
+    }
+
+    /// <summary>
+    /// Final teardown of a connection: marks it closed, drains its recv items,
+    /// closes the socket, clears the object and then either pools it (while the
+    /// pool holds fewer than <c>PoolMax</c> entries) or disposes it.
+    /// </summary>
+    private void Recycle(Connection conn, int fd)
+    {
+        conn.MarkClosed();
+        conn.DrainRecv();
+        close(fd);
+        conn.Clear();
+
+        if (_pool.Count < PoolMax)
+        {
+            _pool.Push(conn);
+        }
+        else
+        {
+            conn.Dispose();
+        }
+    }
+
+    /// <summary>Sets TCP_NODELAY on <paramref name="fd"/>; the setsockopt result is ignored.</summary>
+    private static void SetNoDelay(int fd)
+    {
+        int one = 1;
+        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int));
+    }
+
+    /// <summary>
+    /// Opens an IPv4 TCP socket with SO_REUSEADDR and SO_REUSEPORT requested,
+    /// binds it to 0.0.0.0:<paramref name="port"/> and starts listening with a
+    /// backlog of 128.
+    /// </summary>
+    /// <returns>The listening socket's file descriptor.</returns>
+    /// <exception cref="InvalidOperationException">socket, bind or listen failed.</exception>
+    private static int OpenReusePortListener(ushort port)
+    {
+        int fd = socket(AF_INET, SOCK_STREAM, 0);
+        if (fd < 0)
+        {
+            throw new InvalidOperationException($"socket failed: {fd}");
+        }
+
+        try
+        {
+            // Socket-option results are not checked; a failure here surfaces as
+            // a bind error below.
+            int one = 1;
+            setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
+            setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int));
+
+            sockaddr_in addr = default;
+            addr.sin_family = AF_INET;
+            addr.sin_port = Htons(port);
+            addr.sin_addr.s_addr = 0; // INADDR_ANY
+
+            if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0)
+            {
+                throw new InvalidOperationException("bind failed");
+            }
+
+            if (listen(fd, 128) < 0)
+            {
+                throw new InvalidOperationException("listen failed");
+            }
+
+            return fd;
+        }
+        catch
+        {
+            // Do not leak the descriptor when bind/listen fails.
+            close(fd);
+            throw;
+        }
+    }
+}
diff --git a/MinimaSQPoll/ServerConfig.cs b/MinimaSQPoll/ServerConfig.cs
new file mode 100644
index 0000000..668dc7a
--- /dev/null
+++ b/MinimaSQPoll/ServerConfig.cs
@@ -0,0 +1,35 @@
+namespace MinimaSQPoll;
+
+/// <summary>
+/// All server tunables in one place — replaces the consts that used to be
+/// scattered across Program.cs and Reactor.cs. Defaults match the previous
+/// hardcoded values; override via object initializer in Main, e.g.:
+///   new ServerConfig { Port = 9000, ReactorCount = 8, Incremental = true }.
+/// </summary>
+public sealed record ServerConfig
+{
+    // Server-level.
+    public ushort Port { get; init; } = 8080;
+    public int ReactorCount { get; init; } = 12;
+
+    // Handler style: false = raw ReadAsync/TryGetItem loop; true = PipeReader/PipeWriter.
+    public bool UsePipe { get; init; } = false;
+
+    // io_uring SQ/CQ depth.
+    public uint RingEntries { get; init; } = 8192;
+
+    // Shared buffer ring (used when Incremental == false).
+    public int RecvBufferSize { get; init; } = 32 * 1024;
+    public int BufferRingEntries { get; init; } = 4096;
+
+    // Per-connection write slab + connection pool cap.
+    public int WriteSlabSize { get; init; } = 16 * 1024;
+    public int PoolMax { get; init; } = 1024;
+
+    // Incremental mode (IOU_PBUF_RING_INC) — per-connection rings.
+    // reserved native memory ≈ PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount.
+    public bool Incremental { get; init; } = false;
+    public int MaxConnections { get; init; } = 4096;      // GID cap (one bgid per active connection)
+    public int ConnBufRingEntries { get; init; } = 16;    // buffers per connection ring
+    public int IncRecvBufferSize { get; init; } = 4096;   // bytes per buffer (filled incrementally)
+}
diff --git a/MinimaSQPoll/Utils/Mpsc.cs b/MinimaSQPoll/Utils/Mpsc.cs
new file mode 100644
index 0000000..ece5563
--- /dev/null
+++ b/MinimaSQPoll/Utils/Mpsc.cs
@@ -0,0 +1,115 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaSQPoll.Utils;
+
+/// <summary>
+/// Bounded lock-free multi-producer / single-consumer queue.
+///
+/// Dmitry Vyukov's bounded MPMC algorithm, specialised to one consumer.
+/// Power-of-two capacity, zero-allocation after construction. Producers claim a
+/// slot via CAS on the enqueue position (a failed TryEnqueue on a full queue
+/// leaves the position untouched — no burned tickets); the single consumer
+/// advances the dequeue position with a plain write. Each slot carries a
+/// sequence number that coordinates ownership between producers and consumer.
+///
+/// One generic queue serves every reactor handoff: an Mpsc of ushort for buffer
+/// returns, an Mpsc of int for flush fds, an Mpsc of ulong for packed incremental
+/// returns. T is unmanaged so each Cell is a blittable value type with no GC refs.
+/// </summary>
+/// <typeparam name="T">Unmanaged element type stored in the queue.</typeparam>
+internal sealed class Mpsc<T> where T : unmanaged
+{
+    // One ring slot. Sequence == index: free for the producer holding that
+    // ticket; Sequence == index + 1: filled and ready for the consumer.
+    private struct Cell
+    {
+        public long Sequence;
+        public T Value;
+    }
+
+    private readonly Cell[] _buffer;
+    private readonly int _mask;
+
+    // PaddedLong is a top-level struct (not nested here) because the CLR forbids
+    // explicit layout on a type nested inside a generic.
+    private PaddedLong _enqueuePos;
+    private PaddedLong _dequeuePos;
+
+    /// <summary>Creates a queue with <paramref name="capacityPow2"/> slots.</summary>
+    /// <exception cref="ArgumentException">Capacity is below 2 or not a power of two.</exception>
+    public Mpsc(int capacityPow2)
+    {
+        if (capacityPow2 < 2 || (capacityPow2 & (capacityPow2 - 1)) != 0)
+        {
+            throw new ArgumentException("Capacity must be a power of two >= 2.", nameof(capacityPow2));
+        }
+
+        _buffer = new Cell[capacityPow2];
+        _mask = capacityPow2 - 1;
+
+        // Slot i starts out free for ticket i.
+        for (int i = 0; i < capacityPow2; i++)
+        {
+            _buffer[i].Sequence = i;
+        }
+    }
+
+    /// <summary>Multi-producer safe. Returns false if the queue is full.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryEnqueue(T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        while (true)
+        {
+            long pos = Volatile.Read(ref _enqueuePos.Value);
+            ref Cell cell = ref buffer[(int)pos & mask];
+
+            long seq = Volatile.Read(ref cell.Sequence);
+            long dif = seq - pos;
+
+            if (dif == 0)
+            {
+                if (Interlocked.CompareExchange(ref _enqueuePos.Value, pos + 1, pos) == pos)
+                {
+                    // Slot is ours: store the value, then publish it to the consumer.
+                    cell.Value = item;
+                    Volatile.Write(ref cell.Sequence, pos + 1);
+                    return true;
+                }
+                continue; // lost the race; reload and retry
+            }
+
+            if (dif < 0)
+            {
+                return false; // slot not yet consumed → full
+            }
+
+            // dif > 0: another producer already moved past pos; reload and retry.
+        }
+    }
+
+    /// <summary>Single-consumer only. Returns false if empty.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeue(out T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        long pos = _dequeuePos.Value; // single consumer: plain read
+        ref Cell cell = ref buffer[(int)pos & mask];
+
+        long seq = Volatile.Read(ref cell.Sequence);
+        long dif = seq - (pos + 1);
+
+        if (dif == 0)
+        {
+            item = cell.Value;
+            _dequeuePos.Value = pos + 1; // single consumer: plain write
+            Volatile.Write(ref cell.Sequence, pos + mask + 1); // free slot for producers
+            return true;
+        }
+
+        item = default;
+        return false;
+    }
+}
+
+///
+/// A single long padded to a 64-byte cache line so the producer and consumer
+/// positions never share a line (no false sharing). Top-level and non-generic
+/// so it can legally use explicit layout.
+///
+[StructLayout(LayoutKind.Explicit, Size = 64)]
+internal struct PaddedLong
+{
+    [FieldOffset(0)] public long Value;
+}
diff --git a/MinimaSQPoll/Utils/RingSegment.cs b/MinimaSQPoll/Utils/RingSegment.cs
new file mode 100644
index 0000000..034dfdd
--- /dev/null
+++ b/MinimaSQPoll/Utils/RingSegment.cs
@@ -0,0 +1,31 @@
+using System.Buffers;
+
+namespace MinimaSQPoll.Utils;
+
+/// <summary>
+/// One segment of a multi-buffer ReadOnlySequence of bytes built by the
+/// ConnectionPipeReader when a single read spans more than one recv buffer.
+/// BufferId is carried for debugging; buffer return is driven off the held
+/// item list, not the segments.
+/// </summary>
+public sealed class RingSegment : ReadOnlySequenceSegment<byte>
+{
+    public ushort BufferId { get; }
+
+    /// <summary>Creates a segment over <paramref name="memory"/>, tagged with its buffer id.</summary>
+    public RingSegment(ReadOnlyMemory<byte> memory, ushort bufferId)
+    {
+        Memory = memory;
+        BufferId = bufferId;
+    }
+
+    /// <summary>
+    /// Creates the next segment, links it after this one and returns it. Its
+    /// RunningIndex is this segment's RunningIndex plus this segment's length.
+    /// </summary>
+    public RingSegment Append(ReadOnlyMemory<byte> memory, ushort bufferId)
+    {
+        var next = new RingSegment(memory, bufferId)
+        {
+            RunningIndex = RunningIndex + Memory.Length
+        };
+
+        Next = next;
+        return next;
+    }
+}
diff --git a/MinimaSQPoll/Utils/SpscRecvRing.cs b/MinimaSQPoll/Utils/SpscRecvRing.cs
new file mode 100644
index 0000000..376657b
--- /dev/null
+++ b/MinimaSQPoll/Utils/SpscRecvRing.cs
@@ -0,0 +1,105 @@
+using System.Runtime.CompilerServices;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaSQPoll.Utils;
+
+/// <summary>
+/// Fixed-capacity ring of recv items with one writer of <c>_tail</c>
+/// (TryEnqueue) and one writer of <c>_head</c> (the TryDequeue family). Each
+/// side publishes its own index with a volatile write and reads the other
+/// side's index with a volatile read.
+/// </summary>
+public sealed unsafe class SpscRecvRing
+{
+    /// <summary>One received chunk: pointer, length and the buffer id it came from.</summary>
+    public struct Item
+    {
+        public byte* Ptr;
+        public ushort Bid;
+        public int Len;
+        public bool HasBuffer;
+        public ushort Gen; // connection generation when enqueued (incremental return guard)
+
+        /// <summary>Views the chunk as a span over native memory; nothing is copied.</summary>
+        public ReadOnlySpan<byte> AsSpan() => new(Ptr, Len);
+
+        /// <summary>Allocates a memory manager over the chunk, tagged with <c>Bid</c>.</summary>
+        public UnmanagedMemoryManager AsMemoryManager() => new(Ptr, Len, Bid);
+    }
+
+    private readonly Item[] _items;
+    private readonly int _mask;
+    private long _tail; // next slot to write; advanced only by TryEnqueue
+    private long _head; // next slot to read; advanced only by the dequeue methods
+
+    /// <summary>Creates a ring with <paramref name="capacityPow2"/> slots.</summary>
+    /// <exception cref="ArgumentException">Capacity is not a positive power of two.</exception>
+    public SpscRecvRing(int capacityPow2)
+    {
+        if (capacityPow2 <= 0 || (capacityPow2 & (capacityPow2 - 1)) != 0)
+        {
+            throw new ArgumentException("capacity must be a power of two", nameof(capacityPow2));
+        }
+
+        _items = new Item[capacityPow2];
+        _mask = capacityPow2 - 1;
+    }
+
+    /// <summary>Producer side. Returns false when the ring is full.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryEnqueue(in Item item)
+    {
+        long head = Volatile.Read(ref _head);
+        long tail = _tail;
+
+        if ((ulong)(tail - head) >= (ulong)_items.Length)
+        {
+            return false;
+        }
+
+        // Store the item first, then publish it by advancing the tail.
+        _items[(int)(tail & _mask)] = item;
+        Volatile.Write(ref _tail, tail + 1);
+
+        return true;
+    }
+
+    /// <summary>Consumer side. Returns false when the ring is empty.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeue(out Item item)
+    {
+        long head = _head;
+        long tail = Volatile.Read(ref _tail);
+
+        if (head >= tail)
+        {
+            item = default;
+            return false;
+        }
+
+        item = _items[(int)(head & _mask)];
+        Volatile.Write(ref _head, head + 1);
+
+        return true;
+    }
+
+    /// <summary>Captures the current tail for use with <see cref="TryDequeueUntil"/>.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public long SnapshotTail() => Volatile.Read(ref _tail);
+
+    /// <summary>
+    /// Like <see cref="TryDequeue"/>, but stops at <paramref name="tailSnapshot"/>
+    /// so items enqueued after the snapshot are left in the ring.
+    /// </summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeueUntil(long tailSnapshot, out Item item)
+    {
+        long head = _head;
+
+        if (head >= tailSnapshot)
+        {
+            item = default;
+            return false;
+        }
+
+        item = _items[(int)(head & _mask)];
+        Volatile.Write(ref _head, head + 1);
+
+        return true;
+    }
+
+    /// <summary>True when head has caught up with tail at the moment of the two reads.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool IsEmpty() => Volatile.Read(ref _head) >= Volatile.Read(ref _tail);
+
+    // Reactor-thread-only, called during connection teardown (Clear) when no
+    // handler is consuming. Discards any leftover items so the recycled
+    // connection starts empty.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void Reset()
+    {
+        _head = 0;
+        _tail = 0;
+    }
+}
diff --git a/MinimaSQPoll/Utils/UnmanagedMemoryManager.cs b/MinimaSQPoll/Utils/UnmanagedMemoryManager.cs
new file mode 100644
index 0000000..af4f39b
--- /dev/null
+++ b/MinimaSQPoll/Utils/UnmanagedMemoryManager.cs
@@ -0,0 +1,32 @@
+using System.Buffers;
+
+namespace MinimaSQPoll.Utils;
+
+/// <summary>
+/// Memory manager over a caller-supplied native pointer. It neither allocates
+/// nor frees: Unpin and Dispose are no-ops, so the caller must keep the memory
+/// alive for as long as the resulting Memory or Span is in use.
+/// </summary>
+public sealed unsafe class UnmanagedMemoryManager : MemoryManager<byte>
+{
+    private readonly byte* _ptr;
+    private readonly int _length;
+
+    public ushort BufferId { get; }
+
+    /// <summary>Wraps <paramref name="length"/> bytes at <paramref name="ptr"/> with buffer id 0.</summary>
+    public UnmanagedMemoryManager(byte* ptr, int length)
+        : this(ptr, length, 0)
+    {
+    }
+
+    /// <summary>Wraps <paramref name="length"/> bytes at <paramref name="ptr"/>, tagged with <paramref name="bufferId"/>.</summary>
+    public UnmanagedMemoryManager(byte* ptr, int length, ushort bufferId)
+    {
+        _ptr = ptr;
+        _length = length;
+        BufferId = bufferId;
+    }
+
+    public override Span<byte> GetSpan() => new(_ptr, _length);
+
+    /// <summary>Returns a handle to the byte at <paramref name="elementIndex"/>; nothing needs pinning.</summary>
+    /// <exception cref="ArgumentOutOfRangeException">The index lies outside [0, length].</exception>
+    public override MemoryHandle Pin(int elementIndex = 0)
+    {
+        // The unsigned compare rejects negative values too. An index equal to
+        // the length is allowed so an empty slice at the end can be pinned.
+        if ((uint)elementIndex > (uint)_length)
+        {
+            throw new ArgumentOutOfRangeException(nameof(elementIndex));
+        }
+
+        return new MemoryHandle(_ptr + elementIndex);
+    }
+
+    public override void Unpin() { }
+
+    protected override void Dispose(bool disposing) { }
+}
diff --git a/MinimaSQPoll/io_uring/Native.cs b/MinimaSQPoll/io_uring/Native.cs
new file mode 100644
index 0000000..76e8e77
--- /dev/null
+++ b/MinimaSQPoll/io_uring/Native.cs
@@ -0,0 +1,172 @@
+using System.Runtime.InteropServices;
+
+namespace MinimaSQPoll;
+
+///
+/// All native interop in one file: io_uring syscalls, libc socket calls,
+/// the kernel struct layouts they expect, and the constants needed to
+/// drive a minimal io_uring loop.
+///
+public static unsafe class Native {
+    // io_uring syscall numbers passed to libc's syscall().
+    private const long SYS_IO_URING_SETUP = 425;
+    private const long SYS_IO_URING_ENTER = 426;
+    private const long SYS_IO_URING_REGISTER = 427;
+
+    // SQE opcodes, io_uring_enter flags and mmap offsets for the ring regions.
+    public const byte IORING_OP_POLL_ADD = 6;
+    public const byte IORING_OP_ACCEPT = 13;
+    public const byte IORING_OP_SEND = 26;
+    public const byte IORING_OP_RECV = 27;
+    public const uint IORING_ENTER_GETEVENTS = 1u << 0;
+    public const long IORING_OFF_SQ_RING = 0;
+    public const long IORING_OFF_SQES = 0x10000000;
+
+    // Multishot / buffer-ring goodies.
+    public const ushort IORING_ACCEPT_MULTISHOT = 1 << 0;
+    public const ushort IORING_RECV_MULTISHOT = 1 << 1;
+    public const byte IOSQE_BUFFER_SELECT = 1 << 5;
+    public const uint IORING_CQE_F_BUFFER = 1u << 0;
+    public const uint IORING_CQE_F_MORE = 1u << 1;
+    public const int IORING_CQE_BUFFER_SHIFT = 16;
+    public const uint IORING_REGISTER_PBUF_RING = 22;
+    public const uint IORING_UNREGISTER_PBUF_RING = 23;
+    public const uint IORING_POLL_ADD_MULTI = 1u << 0;
+
+    // Incremental provided-buffer consumption (kernel 6.12+). IOU_PBUF_RING_INC
+    // is set in io_uring_buf_reg.flags at registration; IORING_CQE_F_BUF_MORE is
+    // set on recv CQEs while the kernel will keep appending to the same buffer.
+    public const ushort IOU_PBUF_RING_INC = 2;
+    public const uint IORING_CQE_F_BUF_MORE = 1u << 4;
+
+    // eventfd flags + poll mask (used for the cross-thread wake mechanism).
+    public const int EFD_CLOEXEC = 0x80000;
+    public const int EFD_NONBLOCK = 0x800;
+    public const uint POLLIN = 0x0001;
+
+    // Setup flags. SINGLE_ISSUER tells the kernel only one thread will submit
+    // to this ring (skips locking on the SQ). DEFER_TASKRUN defers completion
+    // processing until io_uring_enter(GETEVENTS), which lets the kernel batch
+    // work and avoids interrupting the reactor with task_work mid-flight.
+    public const uint IORING_SETUP_SINGLE_ISSUER = 1u << 12;
+    public const uint IORING_SETUP_DEFER_TASKRUN = 1u << 13;
+
+    // SQPOLL: kernel spawns a poller thread that reads SQEs from shared memory
+    // and submits them without us calling io_uring_enter. SQ_AFF pins the poller
+    // to a specific CPU. After sq_thread_idle ms of inactivity, the poller
+    // parks and sets SQ_NEED_WAKEUP in sq_flags; we must then call
+    // io_uring_enter(IORING_ENTER_SQ_WAKEUP) to revive it.
+    public const uint IORING_SETUP_SQPOLL = 1u << 1;
+    public const uint IORING_SETUP_SQ_AFF = 1u << 2;
+    public const uint IORING_SQ_NEED_WAKEUP = 1u << 0;
+    public const uint IORING_ENTER_SQ_WAKEUP = 1u << 1;
+
+    // mmap protection / mapping flags.
+    public const int PROT_READ = 1;
+    public const int PROT_WRITE = 2;
+    public const int MAP_SHARED = 1;
+    public const int MAP_POPULATE = 0x8000;
+
+    // Socket constants.
+    public const int AF_INET = 2;
+    public const int SOCK_STREAM = 1;
+    public const int SOL_SOCKET = 1;
+    public const int SO_REUSEADDR = 2;
+    public const int SO_REUSEPORT = 15;
+    public const int IPPROTO_TCP = 6;
+    public const int TCP_NODELAY = 1;
+
+    // Three fixed-arity views of libc's variadic syscall().
+    // NOTE(review): only syscall4 is declared with SetLastError, so errno is
+    // captured for io_uring_register only — confirm how callers of
+    // io_uring_setup / io_uring_enter obtain the error code.
+    [DllImport("libc", EntryPoint = "syscall")]
+    private static extern long syscall3(long nr, uint a1, IoUringParams* a2);
+
+    [DllImport("libc", EntryPoint = "syscall")]
+    private static extern long syscall6(long nr, uint a1, uint a2, uint a3, uint a4, void* a5, nuint a6);
+
+    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
+    private static extern long syscall4(long nr, uint a1, uint a2, void* a3, uint a4);
+
+    /// <summary>Invokes io_uring_setup through libc syscall() and returns its result truncated to int.</summary>
+    public static int io_uring_setup(uint entries, IoUringParams* p) =>
+        (int)syscall3(SYS_IO_URING_SETUP, entries, p);
+
+    /// <summary>Invokes io_uring_enter with a null argument pointer and zero argument size.</summary>
+    public static int io_uring_enter(int fd, uint toSubmit, uint minComplete, uint flags) =>
+        (int)syscall6(SYS_IO_URING_ENTER, (uint)fd, toSubmit, minComplete, flags, null, 0);
+
+    /// <summary>Invokes io_uring_register; errno is captured by the runtime (SetLastError).</summary>
+    public static int io_uring_register(int fd, uint opcode, void* arg, uint nrArgs) =>
+        (int)syscall4(SYS_IO_URING_REGISTER, (uint)fd, opcode, arg, nrArgs);
+
+    [DllImport("libc")] public static extern void* mmap(void* addr, nuint length, int prot, int flags, int fd, long offset);
+    [DllImport("libc")] public static extern int munmap(void* addr, nuint length);
+    [DllImport("libc")] public static extern int close(int fd);
+    [DllImport("libc")] public static extern int socket(int domain, int type, int proto);
+    [DllImport("libc")] public static extern int bind(int fd, sockaddr_in* addr, uint len);
+    [DllImport("libc")] public static extern int listen(int fd, int backlog);
+    [DllImport("libc")] public static extern int setsockopt(int fd, int level, int optname, void* optval, uint optlen);
+    [DllImport("libc")] public static extern int eventfd(uint initval, int flags);
+    [DllImport("libc")] public static extern long write(int fd, void* buf, nuint count);
+    [DllImport("libc")] public static extern long read(int fd, void* buf, nuint count);
+
+    /// <summary>
+    /// Converts a 16-bit value from host to network (big-endian) byte order.
+    /// Bytes are swapped only on little-endian hosts; on a big-endian host the
+    /// value is already in network order and is returned unchanged.
+    /// </summary>
+    public static ushort Htons(ushort x) =>
+        BitConverter.IsLittleEndian
+            ? System.Buffers.Binary.BinaryPrimitives.ReverseEndianness(x)
+            : x;
+
+    // Kernel struct layouts (must match include/uapi/linux/io_uring.h)
+    [StructLayout(LayoutKind.Sequential)]
+    public struct SqRingOffsets {
+        public uint head, tail, ring_mask, ring_entries, flags, dropped, array, resv1;
+        public ulong resv2;
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    public struct CqRingOffsets {
+        public uint head, tail, ring_mask, ring_entries, overflow, cqes, flags, resv1;
+        public ulong resv2;
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    public struct IoUringParams {
+        public uint sq_entries, cq_entries, flags, sq_thread_cpu, sq_thread_idle;
+        public uint features, wq_fd, resv0, resv1, resv2;
+        public SqRingOffsets sq_off;
+        public CqRingOffsets cq_off;
+    }
+
+    // 64-byte submission queue entry; explicit offsets pin every field position.
+    [StructLayout(LayoutKind.Explicit, Size = 64)]
+    public struct IoUringSqe {
+        [FieldOffset(0)]  public byte opcode;
+        [FieldOffset(1)]  public byte flags;
+        [FieldOffset(2)]  public ushort ioprio;
+        [FieldOffset(4)]  public int fd;
+        [FieldOffset(8)]  public ulong off;
+        [FieldOffset(16)] public ulong addr;
+        [FieldOffset(24)] public uint len;
+        [FieldOffset(28)] public uint op_flags;
+        [FieldOffset(32)] public ulong user_data;
+        [FieldOffset(40)] public ushort buf_index;
+        [FieldOffset(42)] public ushort personality;
+        [FieldOffset(44)] public int splice_fd_in;
+        [FieldOffset(48)] public ulong addr3;
+        [FieldOffset(56)] public ulong __pad2;
+    }
+
+    // Completion queue entry (user_data, result, flags).
+    [StructLayout(LayoutKind.Sequential)]
+    public struct IoUringCqe {
+        public ulong user_data;
+        public int res;
+        public uint flags;
+    }
+
+    // Argument struct for IORING_REGISTER_PBUF_RING.
+    [StructLayout(LayoutKind.Sequential)]
+    public struct io_uring_buf_reg {
+        public ulong ring_addr;
+        public uint ring_entries;
+        public ushort bgid;
+        public ushort flags;
+        public ulong resv1, resv2, resv3;
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    public struct in_addr { public uint s_addr; }
+
+    [StructLayout(LayoutKind.Sequential)]
+    public unsafe struct sockaddr_in {
+        public ushort sin_family;
+        public ushort sin_port;
+        public in_addr sin_addr;
+        public fixed byte sin_zero[8];
+    }
+}
diff --git a/MinimaSQPoll/io_uring/Ring.cs b/MinimaSQPoll/io_uring/Ring.cs
new file mode 100644
index 0000000..5e9f38c
--- /dev/null
+++ b/MinimaSQPoll/io_uring/Ring.cs
@@ -0,0 +1,232 @@
+using System.Runtime.CompilerServices;
+using static MinimaSQPoll.Native;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+// ReSharper disable SuggestVarOrType_Elsewhere
+#pragma warning disable CA1806
+
+namespace MinimaSQPoll;
+
+public sealed unsafe class Ring : IDisposable
+{
+    private int _fd;
+
+    public int Fd => _fd;
+
+    private uint* _sqHead;
+    private uint* _sqTail;
+    private uint* _sqArray;
+    private uint* _sqFlags; // kernel-shared SQ flags (carries IORING_SQ_NEED_WAKEUP under SQPOLL)
+    private uint _sqMask;
+    private uint _sqEntries;
+    private IoUringSqe* _sqes;
+    private bool _sqPoll;
+
+    private uint* _cqHead;
+    private uint* _cqTail;
+    private IoUringCqe* _cqes;
+    private uint _cqMask;
+
+    private uint _sqeTail;
+
+    // Guards SQE allocation + publish so any thread can
+    // submit. Critical section
+    // is tiny (write 64 B, advance tail), so a SpinLock comfortably outperforms
+    // Monitor and there is no scheduler interaction.
+    private SpinLock _submitLock = new SpinLock(false);
+
+    private byte* _ringPtr;
+    private nuint _ringSize;
+    private byte* _sqePtr;
+    private nuint _sqeSize;
+
+    // Creates the ring, maps the SQ/CQ ring region and the SQE array, and
+    // resolves the kernel-shared head/tail/mask pointers from the offsets the
+    // kernel reports. Throws InvalidOperationException when setup or either
+    // mmap fails (already-acquired resources are released first).
+    public static Ring Create(uint entries, bool sqPoll = true, uint sqIdleMs = 1000, int sqCpu = -1)
+    {
+        IoUringParams ioUringParams = default;
+        if (sqPoll)
+        {
+            // SQPOLL: kernel poller thread reads SQEs from shared memory and
+            // submits them without us calling io_uring_enter. Incompatible with
+            // SINGLE_ISSUER/DEFER_TASKRUN (the poller is the "submitter" from
+            // the kernel's perspective).
+            ioUringParams.flags = IORING_SETUP_SQPOLL;
+            ioUringParams.sq_thread_idle = sqIdleMs;
+            if (sqCpu >= 0)
+            {
+                ioUringParams.flags |= IORING_SETUP_SQ_AFF;
+                ioUringParams.sq_thread_cpu = (uint)sqCpu;
+            }
+        }
+        else
+        {
+            ioUringParams.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
+        }
+        int fd = io_uring_setup(entries, &ioUringParams);
+        if (fd < 0)
+        {
+            throw new InvalidOperationException($"io_uring_setup failed: {fd}");
+        }
+
+        var ring = new Ring
+        {
+            _fd = fd,
+            _sqEntries = ioUringParams.sq_entries,
+            _sqPoll = sqPoll,
+        };
+
+        // SQ and CQ rings are mapped with one mmap sized for the larger of the two.
+        nuint sqRingBytes = ioUringParams.sq_off.array + ioUringParams.sq_entries * sizeof(uint);
+        nuint cqRingBytes = ioUringParams.cq_off.cqes + ioUringParams.cq_entries * (nuint)sizeof(IoUringCqe);
+        nuint ringBytes = sqRingBytes > cqRingBytes ? sqRingBytes : cqRingBytes;
+
+        void* ringMem = mmap(null, ringBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+        if (ringMem == (void*)-1)
+        {
+            close(fd);
+
+            throw new InvalidOperationException("mmap(SQ_RING) failed");
+        }
+        ring._ringPtr = (byte*)ringMem;
+        ring._ringSize = ringBytes;
+
+        nuint sqeBytes = ioUringParams.sq_entries * (nuint)sizeof(IoUringSqe);
+        void* sqeMem = mmap(null, sqeBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+        if (sqeMem == (void*)-1)
+        {
+            munmap(ringMem, ringBytes);
+            close(fd);
+
+            throw new InvalidOperationException("mmap(SQES) failed");
+        }
+        ring._sqes = (IoUringSqe*)sqeMem;
+        ring._sqePtr = (byte*)sqeMem;
+        ring._sqeSize = sqeBytes;
+
+        byte* ringPointer = (byte*)ringMem;
+        ring._sqHead = (uint*)(ringPointer + ioUringParams.sq_off.head);
+        ring._sqTail = (uint*)(ringPointer + ioUringParams.sq_off.tail);
+        ring._sqArray = (uint*)(ringPointer + ioUringParams.sq_off.array);
+        ring._sqFlags = (uint*)(ringPointer + ioUringParams.sq_off.flags);
+        ring._sqMask = *(uint*)(ringPointer + ioUringParams.sq_off.ring_mask);
+
+        ring._cqHead = (uint*)(ringPointer + ioUringParams.cq_off.head);
+        ring._cqTail = (uint*)(ringPointer + ioUringParams.cq_off.tail);
+        ring._cqes = (IoUringCqe*)(ringPointer + ioUringParams.cq_off.cqes);
+        ring._cqMask = *(uint*)(ringPointer + ioUringParams.cq_off.ring_mask);
+
+        return ring;
+    }
+
+    // Thread-safe SQE allocation. The lock is held until PublishSqe() is called,
+    // so callers must always call PublishSqe (or be on the reactor with no other
+    // threads submitting). Pattern is: sqe = TryGetSqe(); write fields; PublishSqe().
+    // Returns null (with the lock released) when the submission queue is full.
+    public IoUringSqe* TryGetSqe()
+    {
+        bool taken = false;
+        _submitLock.Enter(ref taken);
+
+        uint head = Volatile.Read(ref *_sqHead);
+        if (_sqeTail - head >= _sqEntries)
+        {
+            _submitLock.Exit();
+            return null;
+        }
+
+        uint slot = _sqeTail & _sqMask;
+        _sqArray[slot] = slot;
+        _sqeTail++;
+
+        return &_sqes[slot];
+    }
+
+    // Publishes the SQE the caller just wrote to the kernel-visible tail and
+    // releases the submit lock. Under SQPOLL this also wakes the poller if it
+    // has parked (SQ_NEED_WAKEUP set).
+    public void PublishSqe()
+    {
+        Volatile.Write(ref *_sqTail, _sqeTail);
+
+        if (_sqPoll && (Volatile.Read(ref *_sqFlags) & IORING_SQ_NEED_WAKEUP) != 0)
+        {
+            io_uring_enter(_fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
+        }
+
+        _submitLock.Exit();
+    }
+
+    // Block waiting for at least waitFor CQEs. With direct submission (handlers
+    // call TryGetSqe/PublishSqe), the reactor only ever needs to wait here —
+    // no submit work to coordinate. Under non-SQPOLL we still need to submit
+    // any pending SQEs along with the wait.
+    // Returns the io_uring_enter result (negative on failure), or 0 when there
+    // was nothing to submit and nothing to wait for.
+    public int WaitForCqe(uint waitFor)
+    {
+        if (_sqPoll)
+        {
+            if (waitFor == 0) return 0;
+            return io_uring_enter(_fd, 0, waitFor, IORING_ENTER_GETEVENTS);
+        }
+
+        // Non-SQPOLL fallback: submit + wait in one syscall.
+        // Make sure every allocated SQE is visible to the kernel first (a no-op
+        // when PublishSqe has already advanced the shared tail).
+        uint tail = _sqeTail;
+        if (*_sqTail != tail)
+        {
+            Volatile.Write(ref *_sqTail, tail);
+        }
+
+        // Without SQPOLL the kernel consumes SQEs only inside io_uring_enter and
+        // advances the shared head as it does, so "tail - head" is the number
+        // still waiting to be submitted. Measuring against the published tail
+        // instead always yields 0 once PublishSqe has run, and then nothing
+        // would ever reach the kernel.
+        uint toSubmit = tail - Volatile.Read(ref *_sqHead);
+        if (toSubmit == 0 && waitFor == 0) return 0;
+        uint enterFlags = waitFor > 0 ? IORING_ENTER_GETEVENTS : 0;
+        return io_uring_enter(_fd, toSubmit, waitFor, enterFlags);
+    }
+
+    // Peeks the CQE at the current head without consuming it; pair with CqeSeen().
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryGetCqe(out IoUringCqe cqe)
+    {
+        uint head = *_cqHead;
+        uint tail = Volatile.Read(ref *_cqTail);
+
+        if (head == tail)
+        {
+            cqe = default;
+
+            return false;
+        }
+
+        cqe = _cqes[head & _cqMask];
+
+        return true;
+    }
+
+    // Consumes exactly one CQE by advancing the shared CQ head.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void CqeSeen() => Volatile.Write(ref *_cqHead, *_cqHead + 1);
+
+    // Batched CQ drain (liburing io_uring_for_each_cqe + io_uring_cq_advance):
+    // read the kernel-written tail once (acquire), process the whole batch,
+    // then publish the consumed head once (release) instead of once per CQE.
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead;
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask];
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n);
+
+    // Unmaps both regions and closes the ring fd; safe to call more than once.
+    public void Dispose()
+    {
+        if (_ringPtr != null)
+        {
+            munmap(_ringPtr, _ringSize); _ringPtr = null;
+        }
+
+        if (_sqePtr != null)
+        {
+            munmap(_sqePtr, _sqeSize); _sqePtr = null;
+        }
+
+        if (_fd > 0)
+        {
+            close(_fd); _fd = 0;
+        }
+    }
+}
+
+#pragma warning restore CA1806
diff --git a/MinimaTFlow/Connection/Connection.Read.cs b/MinimaTFlow/Connection/Connection.Read.cs
new file mode 100644
index 0000000..c27f1e2
--- /dev/null
+++ b/MinimaTFlow/Connection/Connection.Read.cs
@@ -0,0 +1,168 @@
+using System.Threading.Tasks.Sources;
+using MinimaTFlow.Utils;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTFlow;
+
+/// <summary>
+/// Per-connection state. The handler may run on any thread (e.g. resumed by
+/// a thread-pool timer); reactor-only side effects are funnelled through the
+/// MPSC queues on `Reactor`.
+/// Coordination uses Interlocked.Exchange on the
+/// arm flags and a sticky `_pending` to close the lost-wakeup race.
+///
+/// Lifetime is pool-managed: the reactor pops a Connection on accept (or new
+/// one if pool is empty), and pushes it back on teardown after `Clear()`. The
+/// `_generation` field is bumped on each `Clear` so stale `ValueTask` tokens
+/// from a previous connection life are detectable and return `Closed()`
+/// instead of leaking the new tenant's state.
+/// </summary>
+public sealed unsafe partial class Connection : IValueTaskSource<RecvSnapshot>
+{
+    internal Connection SetFd(int fd)
+    {
+        ClientFd = fd;
+        return this;
+    }
+
+    // Async continuations: handler resumes on the thread pool so libc send()
+    // in FlushAsync never blocks the reactor thread.
+    private ManualResetValueTaskSourceCore<RecvSnapshot> _readSignal = new()
+    {
+        RunContinuationsAsynchronously = true,
+    };
+    private int _armed;    // 1 while a ReadAsync awaiter is parked on _readSignal
+    private int _pending;  // sticky "something happened while nobody was armed" flag
+    private int _closed;   // non-zero once the connection has been marked closed
+
+    private readonly SpscRecvRing _recv = new(capacityPow2: 16);
+
+    // Returns a snapshot of what is currently readable. Completes synchronously
+    // when data (or a pending/closed signal) is already there; otherwise arms
+    // the awaiter and returns a task backed by this object.
+    // Throws InvalidOperationException if a read is already armed.
+    public ValueTask<RecvSnapshot> ReadAsync()
+    {
+        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1)
+        {
+            Volatile.Write(ref _pending, 0);
+            return new ValueTask<RecvSnapshot>(
+                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+
+        if (Volatile.Read(ref _closed) != 0)
+        {
+            return new ValueTask<RecvSnapshot>(RecvSnapshot.Closed());
+        }
+
+        if (Interlocked.Exchange(ref _armed, 1) == 1)
+        {
+            throw new InvalidOperationException("ReadAsync already armed.");
+        }
+
+        // Snapshot the generation as the IVTS token so a future Clear() can
+        // invalidate this awaiter if the connection gets pool-recycled.
+        int gen = Volatile.Read(ref _generation);
+
+        // Race recovery: re-check between arming and returning the IVTS task.
+        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0)
+        {
+            Volatile.Write(ref _pending, 0);
+            Interlocked.Exchange(ref _armed, 0);
+
+            return new ValueTask<RecvSnapshot>(
+                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+
+        return new ValueTask<RecvSnapshot>(this, (short)gen);
+    }
+
+    // Dequeues the next item that was already enqueued when `snap` was taken.
+    public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item)
+        => _recv.TryDequeueUntil(snap.Tail, out item);
+
+    public void ResetRead() => _readSignal.Reset();
+
+    // Enqueues one recv completion and wakes the armed awaiter, or records
+    // `_pending` when nobody is armed. On queue overflow the buffer (if any) is
+    // handed straight back and the connection is flagged closed.
+    public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr)
+    {
+        if (!_recv.TryEnqueue(new SpscRecvRing.Item
+            {
+                Ptr = ptr,
+                Bid = bid,
+                Len = res,
+                HasBuffer = hasBuffer,
+                Gen = (ushort)Volatile.Read(ref _generation)
+            }))
+        {
+            Console.Error.WriteLine("[conn] recv queue overflow.");
+            if (hasBuffer)
+            {
+                _reactor.ReturnBufferDirect(bid);
+            }
+            Volatile.Write(ref _closed, 1);
+        }
+
+        if (Interlocked.Exchange(ref _armed, 0) == 1)
+        {
+            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+        else
+        {
+            Volatile.Write(ref _pending, 1);
+        }
+    }
+
+    internal void DrainRecv()
+    {
+        // Return any buffer IDs still sitting in the SPSC ring (handler exited
+        // before draining them, or a recv arrived after _closed was set).
+        while (_recv.TryDequeue(out SpscRecvRing.Item item))
+        {
+            if (item.HasBuffer)
+            {
+                _reactor.ReturnBufferDirect(item.Bid);
+            }
+        }
+    }
+
+    // =========================================================================
+    // IValueTaskSource<RecvSnapshot> plumbing — token (= snapshot of `_generation` at await
+    // time) is compared against the current `_generation` to detect stale
+    // awaiters from before a Clear()/pool reuse. Stale awaiters get a
+    // sentinel result rather than the new tenant's state.
+    //
+    // For the actual IVTS dispatch we pass `_readSignal.Version` /
+    // `_flushSignal.Version` to the underlying core (not `token`) because the
+    // core's version is bumped by ResetRead/CompleteFlush mid-life and is
+    // unrelated to the cross-life generation guard.
+    // =========================================================================
+
+    RecvSnapshot IValueTaskSource<RecvSnapshot>.GetResult(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return RecvSnapshot.Closed();
+        }
+
+        return _readSignal.GetResult(_readSignal.Version);
+    }
+
+    ValueTaskSourceStatus IValueTaskSource<RecvSnapshot>.GetStatus(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return ValueTaskSourceStatus.Succeeded;
+        }
+
+        return _readSignal.GetStatus(_readSignal.Version);
+    }
+
+    void IValueTaskSource<RecvSnapshot>.OnCompleted(Action<object?> continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            // Stale — run the continuation now so the awaiter unblocks and
+            // gets RecvSnapshot.Closed() from GetResult.
+            continuation(state);
+
+            return;
+        }
+
+        _readSignal.OnCompleted(continuation, state, _readSignal.Version, flags);
+    }
+}
diff --git a/MinimaTFlow/Connection/Connection.Write.cs b/MinimaTFlow/Connection/Connection.Write.cs
new file mode 100644
index 0000000..5ebf966
--- /dev/null
+++ b/MinimaTFlow/Connection/Connection.Write.cs
@@ -0,0 +1,137 @@
+using System.Buffers;
+using System.Runtime.InteropServices;
+using MinimaTFlow.Utils;
+using static MinimaTFlow.Native;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTFlow;
+
+/// <summary>
+/// Twinflow-style write path: handler thread calls libc send() directly,
+/// keeping the io_uring reactor on the recv side only. No flush IVTS, no MPSC
+/// hand-off, no send CQE — the response goes straight to the kernel via a
+/// single syscall on whichever thread the handler is running on.
+/// </summary>
+public sealed unsafe partial class Connection : IBufferWriter<byte>
+{
+    private readonly int _writeSlabSize;
+    internal byte* WriteBuffer;
+    internal int WriteTail;
+
+    private readonly UnmanagedMemoryManager _manager;
+
+    // IBufferWriter<byte>
+#region IBufferWriter
+
+    // Returns the unused tail of the write slab; throws when sizeHint exceeds it.
+    public Memory<byte> GetMemory(int sizeHint = 0)
+    {
+        int remaining = _writeSlabSize - WriteTail;
+        if (sizeHint > remaining)
+        {
+            throw new InvalidOperationException("Buffer too small.");
+        }
+        return _manager.Memory.Slice(WriteTail, remaining);
+    }
+
+    // Span flavour of GetMemory over the same slab region.
+    public Span<byte> GetSpan(int sizeHint = 0)
+    {
+        if (WriteTail + sizeHint > _writeSlabSize)
+        {
+            throw new InvalidOperationException("Write buffer too small.");
+        }
+        return new Span<byte>(WriteBuffer + WriteTail, _writeSlabSize - WriteTail);
+    }
+
+    public void Advance(int count) => WriteTail += count;
+
+#endregion
+
+    // Copies `source` to the end of the slab; throws when it does not fit.
+    public void Write(ReadOnlySpan<byte> source)
+    {
+        int len = source.Length;
+        if (WriteTail + len > _writeSlabSize)
+        {
+            throw new InvalidOperationException("Write buffer too small.");
+        }
+        source.CopyTo(new Span<byte>(WriteBuffer + WriteTail, len));
+        WriteTail += len;
+    }
+
+    /// <summary>
+    /// Synchronously send everything we've buffered via libc send().
+    /// Returns a completed ValueTask in the common case; on EAGAIN, spin-yields
+    /// the thread until the kernel send buffer drains. No reactor handoff, no
+    /// IVTS — the syscall happens on the handler thread.
+    ///
+    /// Async fallback for EAGAIN is omitted because the class is `unsafe` and
+    /// C# disallows `await` in unsafe context. For HTTP/1.1 plaintext on
+    /// loopback EAGAIN is essentially never hit; if you serve large bodies,
+    /// extract the slow path to a non-unsafe helper.
+    /// </summary>
+    public ValueTask FlushAsync()
+    {
+        if (Volatile.Read(ref _closed) == 1)
+        {
+            // Drop whatever was buffered, exactly like the other closed exits
+            // below, so stale bytes cannot pile up in the slab.
+            WriteTail = 0;
+            return default;
+        }
+
+        int target = WriteTail;
+        if (target == 0)
+        {
+            return default;
+        }
+
+        int off = 0;
+        while (off < target)
+        {
+            int sent = TrySend(WriteBuffer + off, (uint)(target - off), out bool wouldBlock, out bool closed);
+            if (closed)
+            {
+                MarkClosed();
+                WriteTail = 0;
+                return default;
+            }
+            if (sent > 0)
+            {
+                off += sent;
+                continue;
+            }
+            if (wouldBlock)
+            {
+                if (Volatile.Read(ref _closed) == 1)
+                {
+                    WriteTail = 0;
+                    return default;
+                }
+                Thread.Yield();
+            }
+            // sent == 0 without wouldBlock/closed means EINTR: just retry.
+        }
+
+        WriteTail = 0;
+        return default;
+    }
+
+    // One send() attempt. Returns the byte count on progress; otherwise 0 with
+    // `wouldBlock` set for EAGAIN/EWOULDBLOCK (a 0-byte send is treated the
+    // same way), nothing set for EINTR, and `closed` set for any other error.
+    private int TrySend(byte* buf, uint len, out bool wouldBlock, out bool closed)
+    {
+        wouldBlock = false;
+        closed = false;
+        long n = send(ClientFd, buf, len, MSG_NOSIGNAL);
+        if (n > 0)
+        {
+            return (int)n;
+        }
+        int err = (n == 0) ? EAGAIN : Marshal.GetLastPInvokeError();
+        if (err is EAGAIN or EWOULDBLOCK)
+        {
+            wouldBlock = true;
+            return 0;
+        }
+        if (err == EINTR)
+        {
+            return 0;
+        }
+        closed = true;
+        return 0;
+    }
+}
diff --git a/MinimaTFlow/Connection/Connection.cs b/MinimaTFlow/Connection/Connection.cs
new file mode 100644
index 0000000..fe5b112
--- /dev/null
+++ b/MinimaTFlow/Connection/Connection.cs
@@ -0,0 +1,94 @@
+using System.Runtime.InteropServices;
+using MinimaTFlow.Utils;
+
+namespace MinimaTFlow;
+
+public sealed unsafe partial class Connection
+{
+    private readonly Reactor _reactor;
+
+    public int ClientFd { get; private set; }
+
+    // Bumped on Clear(); the low 16 bits are used as the IVTS token so stale
+    // awaiters can be detected after pool reuse.
+    private int _generation;
+
+    // Refcount: the connection has two owners — the reactor (recv side) and the
+    // handler (which may run off-reactor). Init to 2 on accept; each owner DecRef's
+    // when done; teardown (Recycle) runs only at refs==0, so a connection is never
+    // recycled or pool-reused while a handler is still in flight on another thread.
+    private int _refs;
+
+    // Allocates the 64-byte-aligned native write slab and wraps it in a memory
+    // manager so it can also be handed out as Memory. Arguments are validated
+    // up front so a bad size cannot reach the native allocator.
+    public Connection(Reactor reactor, int fd, int writeSlabSize = 1024 * 16)
+    {
+        ArgumentNullException.ThrowIfNull(reactor);
+        ArgumentOutOfRangeException.ThrowIfNegativeOrZero(writeSlabSize);
+
+        _reactor = reactor;
+        ClientFd = fd;
+        _writeSlabSize = writeSlabSize;
+        WriteBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)writeSlabSize, 64);
+
+        _manager = new UnmanagedMemoryManager(WriteBuffer, writeSlabSize);
+    }
+
+    // =========================================================================
+    // Pool lifecycle — invoked from Reactor.Dispatch's recv/send error paths.
+    // Reactor-thread only.
+    //
+    //   teardown: MarkClosed()  → wake awaiters with closed=1
+    //             DrainRecv()   → return any in-flight buf_ring items
+    //             close(fd)
+    //             Clear()       → reset state, bump _generation
+    //             push to pool, OR Dispose() if pool is full
+    // =========================================================================
+
+    // Flags the connection closed and wakes an armed reader with a closed
+    // snapshot; when nobody is armed the sticky `_pending` flag is set instead.
+    public void MarkClosed()
+    {
+        Volatile.Write(ref _closed, 1);
+
+        if (Interlocked.Exchange(ref _armed, 0) == 1)
+        {
+            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true));
+        }
+        else
+        {
+            Volatile.Write(ref _pending, 1);
+        }
+    }
+
+    // Init to 2 (reactor + handler) at accept.
+    internal void InitRefs() => Volatile.Write(ref _refs, 2);
+
+    // Release one owner's ref. Whoever drives it to 0 hands the connection to the
+    // reactor for teardown (close + Clear + pool) — never recycled before both done.
+    internal void DecRef()
+    {
+        if (Interlocked.Decrement(ref _refs) == 0)
+        {
+            _reactor.EnqueueRecycle(this);
+        }
+    }
+
+    // Resets per-life state for pool reuse; the generation bump invalidates any
+    // awaiter token handed out during the previous life.
+    internal void Clear()
+    {
+        Interlocked.Increment(ref _generation);
+
+        Volatile.Write(ref _armed, 0);
+        Volatile.Write(ref _pending, 0);
+        Volatile.Write(ref _closed, 0);
+
+        WriteTail = 0;
+
+        _readSignal.Reset();
+        _recv.Reset();
+    }
+
+    // Frees the native write slab; safe to call more than once.
+    public void Dispose()
+    {
+        if (WriteBuffer != null)
+        {
+            NativeMemory.AlignedFree(WriteBuffer);
+            WriteBuffer = null;
+        }
+    }
+
+    // Hands a recv buffer back through the reactor's return queue.
+    public void ReturnBuffer(in SpscRecvRing.Item item) => _reactor.EnqueueReturnQ(item.Bid);
+}
\ No newline at end of file
diff --git a/MinimaTFlow/Connection/ConnectionDualPipe.cs b/MinimaTFlow/Connection/ConnectionDualPipe.cs
new file mode 100644
index 0000000..90682b5
--- /dev/null
+++ b/MinimaTFlow/Connection/ConnectionDualPipe.cs
@@ -0,0 +1,16 @@
+using System.IO.Pipelines;
+
+namespace MinimaTFlow;
+
+public sealed class ConnectionDualPipe : IDuplexPipe
+{
+    public PipeReader Input { get; }
+    public PipeWriter Output { get; }
+
+    public ConnectionDualPipe(Connection connection)
+    {
+        ArgumentNullException.ThrowIfNull(connection);
+        Input = new ConnectionPipeReader(connection);
+        Output = new ConnectionPipeWriter(connection);
+    }
+}
\ No newline at end of file
diff --git a/MinimaTFlow/Connection/ConnectionPipeReader.cs b/MinimaTFlow/Connection/ConnectionPipeReader.cs
new file mode 100644
index 0000000..ed744ec
--- /dev/null
+++ b/MinimaTFlow/Connection/ConnectionPipeReader.cs
@@ -0,0 +1,181 @@
+using System.Buffers;
+using System.IO.Pipelines;
+using MinimaTFlow.Utils;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTFlow;
+
+/// <summary>
+/// Adapts Minima's raw read API (<c>ReadAsync</c> + <c>TryGetItem</c>
+/// + <c>ReturnBuffer</c>) to a standard <see cref="PipeReader"/>. Recv buffers are
+/// exposed zero-copy as a <c>ReadOnlySequence&lt;byte&gt;</c> (one segment per buffer)
+/// and held until <c>AdvanceTo</c> consumes them, at which point fully-consumed buffers
+/// are returned to the reactor.
+///
+/// Convenience/compat layer for PipeReader consumers — the raw ReadAsync/
+/// TryGetItem path stays the faster one (this adds held-buffer + sequence
+/// bookkeeping per read).
+/// </summary>
+public sealed class ConnectionPipeReader : PipeReader
+{
+    private readonly Connection _conn;
+    private readonly List<Held> _held = new(16);
+    private ReadOnlySequence<byte> _lastSequence;
+
+    private bool _completed;
+    private bool _cancelRequested;
+    private bool _connectionClosed;
+
+    // Set by AdvanceTo when the consumer examined every held byte but did not
+    // consume all of it (e.g. a partial request). While set, the next read
+    // waits for new data instead of handing back the same bytes immediately.
+    private bool _needMoreData;
+
+    // One recv buffer still owned by this reader, plus the slice of it that has
+    // not been consumed yet.
+    private readonly struct Held
+    {
+        public readonly ReadOnlyMemory<byte> Memory;
+        public readonly SpscRecvRing.Item Item;
+
+        public Held(ReadOnlyMemory<byte> memory, SpscRecvRing.Item item)
+        {
+            Memory = memory;
+            Item = item;
+        }
+
+        public Held WithMemory(ReadOnlyMemory<byte> memory) => new(memory, Item);
+    }
+
+    public ConnectionPipeReader(Connection connection)
+    {
+        _conn = connection ?? throw new ArgumentNullException(nameof(connection));
+    }
+
+    // Returns held-but-unconsumed data when there is any the consumer has not
+    // fully examined yet; otherwise awaits the connection for the next recv
+    // burst and appends its buffers to the held list.
+    public override async ValueTask<ReadResult> ReadAsync(CancellationToken cancellationToken = default)
+    {
+        ThrowIfCompleted();
+
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed);
+        }
+
+        // Anything still held from a previous read that wasn't fully consumed —
+        // unless the consumer already examined all of it and no more can arrive
+        // only once the connection is closed.
+        if (_held.Count > 0 && (!_needMoreData || _connectionClosed))
+            return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
+
+        if (_connectionClosed)
+            return new ReadResult(default, isCanceled: false, isCompleted: true);
+
+        RecvSnapshot snap = await _conn.ReadAsync();
+
+        while (_conn.TryGetItem(snap, out SpscRecvRing.Item item))
+        {
+            if (item.HasBuffer)
+                _held.Add(new Held(item.AsMemoryManager().Memory, item));
+        }
+
+        _conn.ResetRead();
+        _needMoreData = false;
+
+        if (snap.IsClosed)
+            _connectionClosed = true;
+
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed);
+        }
+
+        return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
+    }
+
+    // Synchronous counterpart of ReadAsync: succeeds only when data is already
+    // held (and not fully examined) or the connection is closed.
+    public override bool TryRead(out ReadResult result)
+    {
+        ThrowIfCompleted();
+
+        if (_held.Count > 0 && (!_needMoreData || _connectionClosed))
+        {
+            result = new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
+            return true;
+        }
+
+        if (_connectionClosed)
+        {
+            result = new ReadResult(default, isCanceled: false, isCompleted: true);
+            return true;
+        }
+
+        result = default;
+        return false;
+    }
+
+    public override void AdvanceTo(SequencePosition consumed) => AdvanceTo(consumed, consumed);
+
+    // Releases fully-consumed buffers back to the reactor, trims a partially
+    // consumed head buffer, and records whether the consumer examined
+    // everything that was handed out.
+    public override void AdvanceTo(SequencePosition consumed, SequencePosition examined)
+    {
+        if (_held.Count == 0)
+            return;
+
+        long consumedBytes = _lastSequence.Slice(0, consumed).Length;
+        bool examinedAll = _lastSequence.Slice(0, examined).Length == _lastSequence.Length;
+
+        while (_held.Count > 0 && consumedBytes > 0)
+        {
+            Held seg = _held[0];
+            int available = seg.Memory.Length;
+
+            if (consumedBytes >= available)
+            {
+                // Whole buffer consumed — return it to the reactor.
+                _conn.ReturnBuffer(seg.Item);
+                _held.RemoveAt(0);
+                consumedBytes -= available;
+            }
+            else
+            {
+                // Partial — keep the unconsumed tail of this buffer.
+                _held[0] = seg.WithMemory(seg.Memory[(int)consumedBytes..]);
+                consumedBytes = 0;
+            }
+        }
+
+        // Leftover bytes that were all examined cannot be acted on until more
+        // data arrives.
+        _needMoreData = examinedAll && _held.Count > 0;
+    }
+
+    public override void CancelPendingRead() => _cancelRequested = true;
+
+    // Marks the reader completed and returns every still-held buffer.
+    public override void Complete(Exception? exception = null)
+    {
+        if (_completed)
+            return;
+
+        _completed = true;
+
+        for (int i = 0; i < _held.Count; i++)
+            _conn.ReturnBuffer(_held[i].Item);
+
+        _held.Clear();
+    }
+
+    // Builds (and remembers in _lastSequence) the sequence over all held
+    // buffers: empty, single-segment, or a RingSegment chain.
+    private ReadOnlySequence<byte> BuildSequence()
+    {
+        if (_held.Count == 0)
+        {
+            _lastSequence = default;
+            return _lastSequence;
+        }
+
+        if (_held.Count == 1)
+        {
+            _lastSequence = new ReadOnlySequence<byte>(_held[0].Memory);
+            return _lastSequence;
+        }
+
+        var head = new RingSegment(_held[0].Memory, _held[0].Item.Bid);
+        RingSegment tail = head;
+
+        for (int i = 1; i < _held.Count; i++)
+            tail = tail.Append(_held[i].Memory, _held[i].Item.Bid);
+
+        _lastSequence = new ReadOnlySequence<byte>(head, 0, tail, tail.Memory.Length);
+        return _lastSequence;
+    }
+
+    private void ThrowIfCompleted()
+    {
+        if (_completed)
+            throw new InvalidOperationException("Reading is not allowed after the reader was completed.");
+    }
+}
diff --git a/MinimaTFlow/Connection/ConnectionPipeWriter.cs b/MinimaTFlow/Connection/ConnectionPipeWriter.cs
new file mode 100644
index 0000000..a6a41eb
--- /dev/null
+++ b/MinimaTFlow/Connection/ConnectionPipeWriter.cs
@@ -0,0 +1,63 @@
+using System.IO.Pipelines;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTFlow;
+
+/// <summary>
+/// Adapts Minima's write API (<c>GetMemory</c>/<c>GetSpan</c>/<c>Advance</c>/
+/// <c>FlushAsync</c>) to a standard <see cref="PipeWriter"/>, so PipeWriter-based code
+/// can write responses through the connection's per-connection slab.
+/// A thin wrapper — all the work lives in <c>Connection</c>.
+/// </summary>
+public sealed class ConnectionPipeWriter : PipeWriter
+{
+    private readonly Connection _conn;
+    private bool _completed;
+    private bool _cancelRequested;
+    private long _unflushed;
+
+    public ConnectionPipeWriter(Connection connection)
+    {
+        _conn = connection ?? throw new ArgumentNullException(nameof(connection));
+    }
+
+    public override bool CanGetUnflushedBytes => true;
+    public override long UnflushedBytes => _unflushed;
+
+    public override Memory<byte> GetMemory(int sizeHint = 0) => _conn.GetMemory(sizeHint);
+
+    public override Span<byte> GetSpan(int sizeHint = 0) => _conn.GetSpan(sizeHint);
+
+    public override void Advance(int bytes)
+    {
+        _unflushed += bytes;
+        _conn.Advance(bytes);
+    }
+
+    // Flushes through the connection. A pending cancel request is reported
+    // (and cleared) without flushing.
+    public override ValueTask<FlushResult> FlushAsync(CancellationToken cancellationToken = default)
+    {
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ValueTask<FlushResult>(new FlushResult(isCanceled: true, isCompleted: _completed));
+        }
+
+        _unflushed = 0;
+        ValueTask inner = _conn.FlushAsync();
+
+        if (inner.IsCompletedSuccessfully)
+            return new ValueTask<FlushResult>(new FlushResult(isCanceled: false, isCompleted: _completed));
+
+        return AwaitFlush(inner);
+    }
+
+    private async ValueTask<FlushResult> AwaitFlush(ValueTask inner)
+    {
+        await inner;
+        return new FlushResult(isCanceled: false, isCompleted: _completed);
+    }
+
+    public override void CancelPendingFlush() => _cancelRequested = true;
+
+    public override void Complete(Exception? exception = null) => _completed = true;
+}
diff --git a/MinimaTFlow/Connection/RecvSnapshot.cs b/MinimaTFlow/Connection/RecvSnapshot.cs
new file mode 100644
index 0000000..e6daeea
--- /dev/null
+++ b/MinimaTFlow/Connection/RecvSnapshot.cs
@@ -0,0 +1,15 @@
+namespace MinimaTFlow;
+
+public readonly struct RecvSnapshot
+{
+    public readonly long Tail;
+    public readonly bool IsClosed;
+
+    public RecvSnapshot(long tail, bool isClosed)
+    {
+        Tail = tail;
+        IsClosed = isClosed;
+    }
+
+    public static RecvSnapshot Closed() => new(0, isClosed: true);
+}
\ No newline at end of file
diff --git a/MinimaTFlow/MinimaTFlow.csproj b/MinimaTFlow/MinimaTFlow.csproj
new file mode 100644
index 0000000..e6699cd
--- /dev/null
+++ b/MinimaTFlow/MinimaTFlow.csproj
@@ -0,0 +1,12 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <!-- NOTE(review): element names were lost in extraction; the values are original, the names are reconstructed. Confirm against the repository. -->
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net10.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+    <RootNamespace>MinimaTFlow</RootNamespace>
+  </PropertyGroup>
+
+</Project>
diff --git a/MinimaTFlow/Program.cs b/MinimaTFlow/Program.cs
new file mode 100644
index 0000000..78b987e
--- /dev/null
+++ b/MinimaTFlow/Program.cs
@@ -0,0 +1,178 @@
+using System.Buffers;
+using System.IO.Pipelines;
+using System.Text.Json;
+using MinimaTFlow.Utils;
+
+namespace MinimaTFlow;
+
+/// <summary>
+/// Multi-reactor HTTP/1.1 server using io_uring directly. Spawns N reactor
+/// threads (one per CPU); each opens its own SO_REUSEPORT listener, runs its
+/// own io_uring, owns its own connection map. The kernel load-balances new
+/// connections across reactors. Per-connection state never crosses threads,
+/// so no synchronization is needed on the hot path.
+/// </summary>
+internal static unsafe class Program
+{
+    internal static ReadOnlySpan<byte> Response =>
+        "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8;
+
+    private static int Main()
+    {
+        // All tunables live in ServerConfig — override the defaults here.
+        var config = new ServerConfig()
+        {
+            UsePipe = false,
+            ReactorCount = 8
+        };
+
+        Console.WriteLine($"[Minima] starting {config.ReactorCount} reactors on port {config.Port} (incremental={config.Incremental})");
+
+        // One foreground thread per reactor; Main blocks until all of them exit.
+        var threads = new Thread[config.ReactorCount];
+        for (var i = 0; i < config.ReactorCount; i++)
+        {
+            var reactor = new Reactor(i, config);
+
+            threads[i] = new Thread(reactor.Run)
+            {
+                Name = $"reactor-{i}",
+                IsBackground = false
+            };
+            threads[i].Start();
+        }
+
+        foreach (var t in threads)
+        {
+            t.Join();
+        }
+
+        return 0;
+    }
+}
+
+internal static class Handler
+{
+    // Real async-work knob: serialize an in-memory object of WORK_ITEMS elements to JSON
+    // on the THREAD POOL (via Task.Run) per request. 0 / unset = disabled (pure inline
+    // reactor path). Genuine CPU + allocation, not a busy-spin.
+    private static readonly int WorkItems = 50;
+
+    private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1));
+
+    // Builds a deterministic payload of `count` items (only the timestamp varies).
+    private static Payload BuildPayload(int count)
+    {
+        var items = new Item[count];
+        for (int i = 0; i < count; i++)
+        {
+            items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}");
+        }
+        return new Payload(DateTime.UtcNow.ToString("O"), count, items);
+    }
+
+    // Raw-API handler loop: read a burst, return its buffers, do a bit of
+    // thread-pool work, write one fixed response, flush; exits once the
+    // snapshot reports the connection closed. Always releases the handler's ref.
+    public static async Task HandleAsync(Reactor reactor, Connection conn)
+    {
+        try
+        {
+            while (true)
+            {
+                RecvSnapshot snap = await conn.ReadAsync();
+
+                while (conn.TryGetItem(snap, out SpscRecvRing.Item item))
+                {
+                    if (item.HasBuffer)
+                    {
+                        UnmanagedMemoryManager mem = item.AsMemoryManager();
+                        ReadOnlyMemory<byte> data = mem.Memory;
+                        // data is now usable with any BCL Memory/async API
+                        _ = data.Length;
+
+                        // Cross-thread safe and mode-agnostic: routes to the
+                        // shared-ring return or the incremental refcounted return.
+                        conn.ReturnBuffer(in item);
+                    }
+                }
+
+                _ = await Task.Run(static () => JsonSerializer.Serialize("Hello World!"));
+
+                // Real async work: serialize a large object to JSON on the THREAD POOL.
+                // The handler resumes OFF-REACTOR, so the FlushAsync below pays the eventfd
+                // handoff the pure-inline path avoids — and the serialization is genuine
+                // CPU + GC pressure on the pool, not a busy-spin.
+                /*if (WorkItems > 0)
+                {
+                    //_ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject));
+                    JsonSerializer.SerializeToUtf8Bytes(LargeObject);
+                }*/
+
+                // One response per recv burst — accumulate in the connection's
+                // per-connection write slab, then submit and await ack.
+                conn.Write(Program.Response);
+                await conn.FlushAsync();
+
+                if (snap.IsClosed)
+                {
+                    // Reactor already owns teardown (Connections.Remove + close
+                    // happens in Dispatch's recv-error branch); we just exit.
+                    return;
+                }
+
+                conn.ResetRead();
+            }
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine($"[r{reactor.Id}] handler crash on fd={conn.ClientFd}: {ex}");
+            // Reactor will clean the connection up via the recv-error path
+            // (or SPSC overflow) on the next CQE for this fd.
+        }
+        finally
+        {
+            conn.DecRef(); // release the handler's ref; teardown runs once the reactor releases too
+        }
+    }
+
+    // PipeReader/PipeWriter variant — same behavior, driven through the BCL
+    // pipe adapters instead of the raw ReadAsync/TryGetItem/Write API.
+    public static async Task HandlePipeAsync(Reactor reactor, Connection conn)
+    {
+        var reader = new ConnectionPipeReader(conn);
+        var writer = new ConnectionPipeWriter(conn);
+
+        try
+        {
+            while (true)
+            {
+                ReadResult read = await reader.ReadAsync();
+                ReadOnlySequence<byte> buffer = read.Buffer;
+
+                if (!buffer.IsEmpty)
+                {
+                    // A real server would parse requests out of `buffer` here.
+                    writer.Write(Program.Response);
+                    await writer.FlushAsync();
+                }
+
+                // Consume everything we got; AdvanceTo returns the recv buffers.
+                reader.AdvanceTo(buffer.End);
+
+                if (read.IsCompleted)
+                {
+                    break;
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine($"[r{reactor.Id}] pipe handler crash on fd={conn.ClientFd}: {ex}");
+        }
+        finally
+        {
+            reader.Complete();
+            writer.Complete();
+            conn.DecRef();
+        }
+    }
+}
+
+internal sealed record Item(int Id, string Name, double Value, bool Active, string Category);
+internal sealed record Payload(string Generated, int Count, Item[] Items);
diff --git a/MinimaTFlow/Reactor/Reactor.cs b/MinimaTFlow/Reactor/Reactor.cs
new file mode 100644
index 0000000..7af8ce7
--- /dev/null
+++ b/MinimaTFlow/Reactor/Reactor.cs
@@ -0,0 +1,370 @@
+using System.Collections.Concurrent;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using MinimaTFlow.Utils;
+using static MinimaTFlow.Native;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTFlow;
+
+/// <summary>
+/// Recv-only io_uring reactor. Accept + multishot recv flow through the ring;
+/// the response goes out via libc send() on the handler thread
+/// (Twinflow-style), so the reactor never deals with sends. Cross-thread paths
+/// (buffer-return, recycle) still go through the MPSC + eventfd-wake pattern.
+/// +public sealed unsafe partial class Reactor +{ + public readonly int Id; + public Ring Ring = null!; + public readonly Dictionary Connections = new(); + + private int _listenFd; + private readonly ServerConfig _config; + private readonly ushort _port; + private readonly uint _ringEntries; + private readonly uint RecvBufferSize; + + private const ulong KindAccept = 1UL << 32; + private const ulong KindRecv = 2UL << 32; + private const ulong KindWake = 4UL << 32; + + private const ushort BgId = 1; + private readonly uint BufferRingEntries; + private byte* _bufRing; + private byte* _bufSlab; + private uint _bufRingMask; + private ushort _bufRingTail; + + private int _wakeFd; + private int _reactorThreadId; + private readonly Mpsc _returnQ = new(1 << 14); + private readonly ConcurrentQueue _recycleQ = new(); + + private readonly int PoolMax; + private readonly Stack _pool; + + private const int EINTR = 4; + private const int EAGAIN = 11; + private const int EBUSY = 16; + + public Reactor(int id, ServerConfig config) + { + Id = id; + _config = config; + _port = config.Port; + _ringEntries = config.RingEntries; + RecvBufferSize = (uint)config.RecvBufferSize; + BufferRingEntries = (uint)config.BufferRingEntries; + PoolMax = config.PoolMax; + _pool = new Stack(config.PoolMax); + } + + // ========================================================================= + // Buffer ring + // ========================================================================= + + private void InitBufferRing() + { + nuint ringBytes = (nuint)BufferRingEntries * 16; + _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096); + NativeMemory.Clear(_bufRing, ringBytes); + + nuint slabBytes = BufferRingEntries * (nuint)RecvBufferSize; + _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64); + + _bufRingMask = BufferRingEntries - 1; + + var reg = new io_uring_buf_reg { + ring_addr = (ulong)_bufRing, + ring_entries = BufferRingEntries, + bgid = BgId, + }; + + int ret = 
io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, &reg, 1);
+        if (ret < 0)
+        {
+            int err = Marshal.GetLastPInvokeError();
+            throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}");
+        }
+
+        // Seed the ring: entry `bid` describes slab buffer `bid`
+        // (address at +0, length at +8, buffer id at +12 of each 16-byte entry).
+        for (ushort bid = 0; bid < BufferRingEntries; bid++) {
+            byte* slot = _bufRing + (uint)bid * 16;
+            *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+            *(uint*)(slot + 8) = RecvBufferSize;
+            *(ushort*)(slot + 12) = bid;
+        }
+        // Publish all entries at once through the 16-bit tail at byte offset 14.
+        _bufRingTail = (ushort)BufferRingEntries;
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    /// <summary>
+    /// Hands buffer <paramref name="bid"/> back to the buffer ring: fills the entry at the
+    /// current tail (address, length, id) and then publishes the incremented tail with a
+    /// volatile write. Takes no lock; other threads go through <see cref="EnqueueReturnQ"/>.
+    /// </summary>
+    internal void ReturnBufferDirect(ushort bid)
+    {
+        byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16;
+        *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+        *(uint*)(slot + 8) = RecvBufferSize;
+        *(ushort*)(slot + 12) = bid;
+        _bufRingTail++;
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    // =========================================================================
+    // Cross-thread entry points
+    // =========================================================================
+
+    /// <summary>
+    /// Returns a recv buffer from any thread. On the reactor thread the buffer is
+    /// re-published immediately; otherwise its id is queued (spinning while the queue
+    /// is full) and the reactor is woken through the eventfd.
+    /// </summary>
+    public void EnqueueReturnQ(ushort bid)
+    {
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            ReturnBufferDirect(bid);
+            return;
+        }
+        SpinWait sw = default;
+        while (!_returnQ.TryEnqueue(bid)) sw.SpinOnce();
+        WakeFdWrite();
+    }
+
+    /// <summary>
+    /// Requests teardown of <paramref name="conn"/> from any thread. On the reactor
+    /// thread it is recycled inline; otherwise it is queued and the reactor is woken.
+    /// </summary>
+    internal void EnqueueRecycle(Connection conn)
+    {
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            Recycle(conn, conn.ClientFd);
+            return;
+        }
+        _recycleQ.Enqueue(conn);
+        WakeFdWrite();
+    }
+
+    /// <summary>
+    /// Writes the 8-byte value 1 to the wake fd so the reactor's poll on it completes.
+    /// The result of write() is not inspected.
+    /// </summary>
+    private void WakeFdWrite()
+    {
+        ulong v = 1;
+        write(_wakeFd, &v, 8);
+    }
+
+    /// <summary>Re-publishes every buffer id currently waiting in the return queue.</summary>
+    private void DrainReturnQ()
+    {
+        while (_returnQ.TryDequeue(out ushort bid))
+        {
+            ReturnBufferDirect(bid);
+        }
+    }
+
+    /// <summary>Recycles every connection currently waiting in the recycle queue.</summary>
+    private void DrainRecycleQ()
+    {
+        while (_recycleQ.TryDequeue(out Connection? 
conn)) + { + Recycle(conn, conn.ClientFd); + } + } + + private void ArmWakePoll() + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_POLL_ADD; + sqe->fd = _wakeFd; + sqe->op_flags = POLLIN; + sqe->len = IORING_POLL_ADD_MULTI; + sqe->user_data = KindWake | (uint)_wakeFd; + } + + // ========================================================================= + // Main loop + // ========================================================================= + + public void Run() + { + _reactorThreadId = Environment.CurrentManagedThreadId; + + Ring = Ring.Create(_ringEntries); + _listenFd = OpenReusePortListener(_port); + + InitBufferRing(); + + _wakeFd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (_wakeFd < 0) + { + throw new InvalidOperationException("eventfd failed"); + } + + Console.WriteLine($"[r{Id}] listening on 0.0.0.0:{_port}"); + SubmitAcceptMultishot(); + ArmWakePoll(); + + LoopShared(); + + close(_listenFd); + close(_wakeFd); + Ring.Dispose(); + } + + private void LoopShared() + { + while (true) + { + DrainReturnQ(); + DrainRecycleQ(); + + int rc = Ring.SubmitAndWait(1); + if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) + { + Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}"); + break; + } + + uint ready = Ring.CqReady(); + for (uint i = 0; i < ready; i++) + { + Dispatch(in Ring.CqeAt(i)); + } + Ring.CqAdvance(ready); + } + } + + private void Dispatch(in IoUringCqe cqe) + { + ulong kind = cqe.user_data & 0xffffffff_00000000UL; + int fd = (int)(cqe.user_data & 0xffffffffUL); + bool more = (cqe.flags & IORING_CQE_F_MORE) != 0; + + if (kind == KindWake) + { + ulong drain; + read(_wakeFd, &drain, 8); + if (!more) ArmWakePoll(); + return; + } + + if (kind == KindAccept) + { + if (cqe.res >= 0) + { + int clientFd = cqe.res; + SetNoDelay(clientFd); + Connection conn = _pool.TryPop(out var pooled) + ? 
pooled.SetFd(clientFd) + : new Connection(this, clientFd, _config.WriteSlabSize); + Connections[clientFd] = conn; + conn.InitRefs(); + SubmitRecvMultishot(clientFd); + + _ = _config.UsePipe + ? Handler.HandlePipeAsync(this, conn) + : Handler.HandleAsync(this, conn); + } + else + { + Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}"); + } + if (!more) SubmitAcceptMultishot(); + } + else if (kind == KindRecv) + { + bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0; + ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0; + + if (cqe.res <= 0) + { + if (hasBuf) ReturnBufferDirect(bid); + if (Connections.Remove(fd, out var dyingConn)) + { + dyingConn.MarkClosed(); + dyingConn.DecRef(); + } + return; + } + + if (!Connections.TryGetValue(fd, out var conn)) + { + if (hasBuf) ReturnBufferDirect(bid); + return; + } + + byte* ptr = hasBuf ? _bufSlab + (nuint)bid * (nuint)RecvBufferSize : null; + conn.Complete(cqe.res, bid, hasBuf, ptr); + + if (!more) SubmitRecvMultishot(fd); + } + } + + // ========================================================================= + // SQE producers — reactor-thread only (no send op; that's libc send() in + // the handler). 
+    // =========================================================================
+
+    /// <summary>
+    /// Returns a free SQE. When the submission queue is full, submits what is pending
+    /// (without waiting for completions) and retries once.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">No SQE is free even after the flush.</exception>
+    private IoUringSqe* GetSqeOrFlush()
+    {
+        IoUringSqe* sqe = Ring.GetSqe();
+        if (sqe != null) return sqe;
+        Ring.SubmitAndWait(0);
+        sqe = Ring.GetSqe();
+        if (sqe == null) throw new InvalidOperationException("SQ full after flush");
+        return sqe;
+    }
+
+    /// <summary>
+    /// Queues a multishot accept on the listener. The SQE is zeroed first and tagged
+    /// with <c>KindAccept | fd</c> so Dispatch can route its completions.
+    /// </summary>
+    private void SubmitAcceptMultishot()
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_ACCEPT;
+        sqe->ioprio = IORING_ACCEPT_MULTISHOT;
+        sqe->fd = _listenFd;
+        sqe->user_data = KindAccept | (uint)_listenFd;
+    }
+
+    /// <summary>
+    /// Queues a multishot recv on <paramref name="fd"/> that takes its buffers from
+    /// buffer group <c>BgId</c>; completions are tagged <c>KindRecv | fd</c>.
+    /// </summary>
+    private void SubmitRecvMultishot(int fd)
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_RECV;
+        sqe->flags = IOSQE_BUFFER_SELECT;
+        sqe->ioprio = IORING_RECV_MULTISHOT;
+        sqe->fd = fd;
+        sqe->buf_index = BgId;
+        sqe->user_data = KindRecv | (uint)fd;
+    }
+
+    /// <summary>
+    /// Tears a connection down: marks it closed, drains its recv state, closes the
+    /// socket and clears the object, then returns it to the pool, or disposes it when
+    /// the pool already holds <c>PoolMax</c> entries.
+    /// </summary>
+    private void Recycle(Connection conn, int fd)
+    {
+        conn.MarkClosed();
+        conn.DrainRecv();
+        close(fd);
+        conn.Clear();
+
+        if (_pool.Count < PoolMax)
+        {
+            _pool.Push(conn);
+        }
+        else
+        {
+            conn.Dispose();
+        }
+    }
+
+    /// <summary>Sets TCP_NODELAY on <paramref name="fd"/>; the setsockopt result is not checked.</summary>
+    private static void SetNoDelay(int fd)
+    {
+        int one = 1;
+        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int));
+    }
+
+    /// <summary>
+    /// Opens an IPv4 TCP listener on 0.0.0.0:<paramref name="port"/> with SO_REUSEADDR
+    /// and SO_REUSEPORT set and a backlog of 128.
+    /// </summary>
+    /// <returns>The listening socket fd.</returns>
+    /// <exception cref="InvalidOperationException">
+    /// socket, bind or listen failed. The socket is closed before a bind/listen failure
+    /// is reported, so no fd is leaked.
+    /// </exception>
+    private static int OpenReusePortListener(ushort port)
+    {
+        int fd = socket(AF_INET, SOCK_STREAM, 0);
+        if (fd < 0) throw new InvalidOperationException($"socket failed: {fd}");
+
+        int one = 1;
+        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
+        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int));
+
+        sockaddr_in addr = default;
+        addr.sin_family = AF_INET;
+        addr.sin_port = Htons(port);
+        addr.sin_addr.s_addr = 0;
+
+        if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0)
+        {
+            close(fd); // do not leak the socket on the failure path
+            throw new InvalidOperationException("bind failed");
+        }
+
+        if (listen(fd, 128) < 0)
+        {
+            close(fd); // do not leak the socket on the failure path
+            throw new InvalidOperationException("listen failed");
+        }
+
+        return fd;
+    }
+}
diff --git a/MinimaTFlow/ServerConfig.cs 
b/MinimaTFlow/ServerConfig.cs new file mode 100644 index 0000000..8e09ae3 --- /dev/null +++ b/MinimaTFlow/ServerConfig.cs @@ -0,0 +1,35 @@ +namespace MinimaTFlow; + +/// +/// All server tunables in one place — replaces the consts that used to be +/// scattered across Program.cs and Reactor.cs. Defaults match the previous +/// hardcoded values; override via object initializer in Main, e.g.: +/// new ServerConfig { Port = 9000, ReactorCount = 8, Incremental = true }. +/// +public sealed record ServerConfig +{ + // Server-level. + public ushort Port { get; init; } = 8080; + public int ReactorCount { get; init; } = 12; + + // Handler style: false = raw ReadAsync/TryGetItem loop; true = PipeReader/PipeWriter. + public bool UsePipe { get; init; } = false; + + // io_uring SQ/CQ depth. + public uint RingEntries { get; init; } = 8192; + + // Shared buffer ring (used when Incremental == false). + public int RecvBufferSize { get; init; } = 32 * 1024; + public int BufferRingEntries { get; init; } = 4096; + + // Per-connection write slab + connection pool cap. + public int WriteSlabSize { get; init; } = 16 * 1024; + public int PoolMax { get; init; } = 1024; + + // Incremental mode (IOU_PBUF_RING_INC) — per-connection rings. + // reserved native memory ≈ PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount. 
+ public bool Incremental { get; init; } = false; + public int MaxConnections { get; init; } = 4096; // GID cap (one bgid per active connection) + public int ConnBufRingEntries { get; init; } = 16; // buffers per connection ring + public int IncRecvBufferSize { get; init; } = 4096; // bytes per buffer (filled incrementally) +} diff --git a/MinimaTFlow/Utils/Mpsc.cs b/MinimaTFlow/Utils/Mpsc.cs new file mode 100644 index 0000000..0575711 --- /dev/null +++ b/MinimaTFlow/Utils/Mpsc.cs @@ -0,0 +1,115 @@ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace MinimaTFlow.Utils; + +/// +/// Bounded lock-free multi-producer / single-consumer queue. +/// +/// Dmitry Vyukov's bounded MPMC algorithm, specialised to one consumer. +/// Power-of-two capacity, zero-allocation after construction. Producers claim a +/// slot via CAS on the enqueue position (a failed TryEnqueue on a full queue +/// leaves the position untouched — no burned tickets); the single consumer +/// advances the dequeue position with a plain write. Each slot carries a +/// sequence number that coordinates ownership between producers and consumer. +/// +/// One generic queue serves every reactor handoff: Mpsc<ushort> for buffer +/// returns, Mpsc<int> for flush fds, Mpsc<ulong> for packed incremental +/// returns. T is unmanaged so each Cell is a blittable value type with no GC refs. +/// +internal sealed class Mpsc where T : unmanaged +{ + private struct Cell + { + public long Sequence; + public T Value; + } + + private readonly Cell[] _buffer; + private readonly int _mask; + + // PaddedLong is a top-level struct (not nested here) because the CLR forbids + // explicit layout on a type nested inside a generic. 
+    // Next ticket a producer will try to claim; advanced by CAS in TryEnqueue.
+    private PaddedLong _enqueuePos;
+    // Next ticket the consumer will read; advanced by a plain write in TryDequeue.
+    private PaddedLong _dequeuePos;
+
+    /// <summary>
+    /// Creates a queue with <paramref name="capacityPow2"/> slots and seeds each slot's
+    /// sequence with its own index, which is the value TryEnqueue expects for the
+    /// first ticket that maps to that slot.
+    /// </summary>
+    /// <exception cref="ArgumentException">
+    /// <paramref name="capacityPow2"/> is below 2 or not a power of two.
+    /// </exception>
+    public Mpsc(int capacityPow2)
+    {
+        if (capacityPow2 < 2 || (capacityPow2 & (capacityPow2 - 1)) != 0)
+            throw new ArgumentException("Capacity must be a power of two >= 2.", nameof(capacityPow2));
+
+        _buffer = new Cell[capacityPow2];
+        _mask = capacityPow2 - 1;
+
+        for (int i = 0; i < capacityPow2; i++)
+            _buffer[i].Sequence = i;
+    }
+
+    /// <summary>Multi-producer safe. Returns false if the queue is full.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryEnqueue(T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        while (true)
+        {
+            long pos = Volatile.Read(ref _enqueuePos.Value);
+            ref Cell cell = ref buffer[(int)pos & mask];
+
+            long seq = Volatile.Read(ref cell.Sequence);
+            long dif = seq - pos;
+
+            // seq == pos: the slot is free for exactly this ticket.
+            if (dif == 0)
+            {
+                // Claim the ticket; only the CAS winner may write the slot.
+                if (Interlocked.CompareExchange(ref _enqueuePos.Value, pos + 1, pos) == pos)
+                {
+                    cell.Value = item;
+                    // Publishing pos + 1 is what TryDequeue tests for ("value ready").
+                    Volatile.Write(ref cell.Sequence, pos + 1);
+                    return true;
+                }
+                continue; // lost the race; reload and retry
+            }
+
+            if (dif < 0)
+                return false; // slot not yet consumed → full
+
+            // dif > 0: the position read above is stale; loop and reload it.
+        }
+    }
+
+    /// <summary>Single-consumer only. Returns false if empty.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeue(out T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        long pos = _dequeuePos.Value; // single consumer: plain read
+        ref Cell cell = ref buffer[(int)pos & mask];
+
+        long seq = Volatile.Read(ref cell.Sequence);
+        long dif = seq - (pos + 1);
+
+        // seq == pos + 1: a producer has finished writing this ticket's value.
+        if (dif == 0)
+        {
+            item = cell.Value;
+            _dequeuePos.Value = pos + 1; // single consumer: plain write
+            // pos + mask + 1 == pos + capacity: the ticket that reuses this slot next lap.
+            Volatile.Write(ref cell.Sequence, pos + mask + 1); // free slot for producers
+            return true;
+        }
+
+        // Any other sequence value is reported as empty.
+        item = default;
+        return false;
+    }
+}
+
+///
+/// A single long padded to a 64-byte cache line so the producer and consumer
+/// positions never share a line (no false sharing). Top-level and non-generic
+/// so it can legally use explicit layout. 
+/// +[StructLayout(LayoutKind.Explicit, Size = 64)] +internal struct PaddedLong +{ + [FieldOffset(0)] public long Value; +} diff --git a/MinimaTFlow/Utils/RingSegment.cs b/MinimaTFlow/Utils/RingSegment.cs new file mode 100644 index 0000000..758f975 --- /dev/null +++ b/MinimaTFlow/Utils/RingSegment.cs @@ -0,0 +1,31 @@ +using System.Buffers; + +namespace MinimaTFlow.Utils; + +/// +/// One segment of a multi-buffer ReadOnlySequence<byte> built by the +/// ConnectionPipeReader when a single read spans more than one recv buffer. +/// BufferId is carried for debugging; buffer return is driven off the held +/// item list, not the segments. +/// +public sealed class RingSegment : ReadOnlySequenceSegment +{ + public ushort BufferId { get; } + + public RingSegment(ReadOnlyMemory memory, ushort bufferId) + { + Memory = memory; + BufferId = bufferId; + } + + public RingSegment Append(ReadOnlyMemory memory, ushort bufferId) + { + var next = new RingSegment(memory, bufferId) + { + RunningIndex = RunningIndex + Memory.Length + }; + + Next = next; + return next; + } +} diff --git a/MinimaTFlow/Utils/SpscRecvRing.cs b/MinimaTFlow/Utils/SpscRecvRing.cs new file mode 100644 index 0000000..13d434a --- /dev/null +++ b/MinimaTFlow/Utils/SpscRecvRing.cs @@ -0,0 +1,105 @@ +using System.Runtime.CompilerServices; + +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace MinimaTFlow.Utils; + +public sealed unsafe class SpscRecvRing +{ + public struct Item + { + public byte* Ptr; + public ushort Bid; + public int Len; + public bool HasBuffer; + public ushort Gen; // connection generation when enqueued (incremental return guard) + + public ReadOnlySpan AsSpan() => new(Ptr, Len); + + public UnmanagedMemoryManager AsMemoryManager() => new(Ptr, Len, Bid); + } + + private readonly Item[] _items; + private readonly int _mask; + private long _tail; + private long _head; + + public SpscRecvRing(int capacityPow2) + { + if (capacityPow2 <= 0 || (capacityPow2 & (capacityPow2 - 1)) != 0) + { + 
throw new ArgumentException("capacity must be a power of two", nameof(capacityPow2)); + } + + _items = new Item[capacityPow2]; + _mask = capacityPow2 - 1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryEnqueue(in Item item) + { + long head = Volatile.Read(ref _head); + long tail = _tail; + + if ((ulong)(tail - head) >= (ulong)_items.Length) + { + return false; + } + + _items[(int)(tail & _mask)] = item; + Volatile.Write(ref _tail, tail + 1); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryDequeue(out Item item) + { + long head = _head; + long tail = Volatile.Read(ref _tail); + + if (head >= tail) + { + item = default; + return false; + } + + item = _items[(int)(head & _mask)]; + Volatile.Write(ref _head, head + 1); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long SnapshotTail() => Volatile.Read(ref _tail); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryDequeueUntil(long tailSnapshot, out Item item) + { + long head = _head; + + if (head >= tailSnapshot) + { + item = default; + return false; + } + + item = _items[(int)(head & _mask)]; + Volatile.Write(ref _head, head + 1); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool IsEmpty() => Volatile.Read(ref _head) >= Volatile.Read(ref _tail); + + // Reactor-thread-only, called during connection teardown (Clear) when no + // handler is consuming. Discards any leftover items so the recycled + // connection starts empty. 
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void Reset()
+    {
+        _head = 0;
+        _tail = 0;
+    }
+}
diff --git a/MinimaTFlow/Utils/UnmanagedMemoryManager.cs b/MinimaTFlow/Utils/UnmanagedMemoryManager.cs
new file mode 100644
index 0000000..ad7a2f1
--- /dev/null
+++ b/MinimaTFlow/Utils/UnmanagedMemoryManager.cs
@@ -0,0 +1,59 @@
+using System.Buffers;
+
+namespace MinimaTFlow.Utils;
+
+/// <summary>
+/// <see cref="MemoryManager{T}"/> view over a caller-supplied native buffer, so a raw
+/// pointer/length pair can be passed to Memory-based APIs. This type never allocates
+/// or frees the buffer: <see cref="Unpin"/> and Dispose are no-ops.
+/// </summary>
+public sealed unsafe class UnmanagedMemoryManager : MemoryManager<byte>
+{
+    private readonly byte* _ptr;
+    private readonly int _length;
+
+    /// <summary>Buffer id given at construction (0 for the two-argument constructor).</summary>
+    public ushort BufferId { get; }
+
+    /// <summary>Wraps <paramref name="length"/> bytes starting at <paramref name="ptr"/>.</summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
+    public UnmanagedMemoryManager(byte* ptr, int length)
+        : this(ptr, length, 0)
+    {
+    }
+
+    /// <summary>Wraps the buffer and records <paramref name="bufferId"/> in <see cref="BufferId"/>.</summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
+    public UnmanagedMemoryManager(byte* ptr, int length, ushort bufferId)
+    {
+        ArgumentOutOfRangeException.ThrowIfNegative(length);
+
+        _ptr = ptr;
+        _length = length;
+        BufferId = bufferId;
+    }
+
+    /// <summary>Returns a span over the whole wrapped buffer.</summary>
+    public override Span<byte> GetSpan() => new(_ptr, _length);
+
+    /// <summary>
+    /// Returns a handle to the byte at <paramref name="elementIndex"/>. Nothing is pinned.
+    /// </summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="elementIndex"/> is outside [0, length].</exception>
+    public override MemoryHandle Pin(int elementIndex = 0)
+    {
+        // One unsigned compare rejects both negative and past-the-end indexes.
+        if ((uint)elementIndex > (uint)_length)
+        {
+            throw new ArgumentOutOfRangeException(nameof(elementIndex));
+        }
+
+        return new MemoryHandle(_ptr + elementIndex);
+    }
+
+    /// <summary>No-op: <see cref="Pin"/> takes no pin to release.</summary>
+    public override void Unpin() { }
+
+    /// <summary>No-op: nothing is released here.</summary>
+    protected override void Dispose(bool disposing) { }
+}
diff --git a/MinimaTFlow/io_uring/Native.cs b/MinimaTFlow/io_uring/Native.cs
new file mode 100644
index 0000000..71e79b6
--- /dev/null
+++ b/MinimaTFlow/io_uring/Native.cs
@@ -0,0 +1,170 @@
+using System.Runtime.InteropServices;
+
+namespace MinimaTFlow;
+
+///
+/// All native interop in one file: io_uring syscalls, libc socket calls,
+/// the kernel struct layouts they expect, and the constants needed to
+/// drive a minimal io_uring loop. 
+/// +public static unsafe class Native { + private const long SYS_IO_URING_SETUP = 425; + private const long SYS_IO_URING_ENTER = 426; + private const long SYS_IO_URING_REGISTER = 427; + + public const byte IORING_OP_POLL_ADD = 6; + public const byte IORING_OP_ACCEPT = 13; + public const byte IORING_OP_SEND = 26; + public const byte IORING_OP_RECV = 27; + public const uint IORING_ENTER_GETEVENTS = 1u << 0; + public const long IORING_OFF_SQ_RING = 0; + public const long IORING_OFF_SQES = 0x10000000; + + // Multishot / buffer-ring goodies. + public const ushort IORING_ACCEPT_MULTISHOT = 1 << 0; + public const ushort IORING_RECV_MULTISHOT = 1 << 1; + public const byte IOSQE_BUFFER_SELECT = 1 << 5; + public const uint IORING_CQE_F_BUFFER = 1u << 0; + public const uint IORING_CQE_F_MORE = 1u << 1; + public const int IORING_CQE_BUFFER_SHIFT = 16; + public const uint IORING_REGISTER_PBUF_RING = 22; + public const uint IORING_UNREGISTER_PBUF_RING = 23; + public const uint IORING_POLL_ADD_MULTI = 1u << 0; + + // Incremental provided-buffer consumption (kernel 6.12+). IOU_PBUF_RING_INC + // is set in io_uring_buf_reg.flags at registration; IORING_CQE_F_BUF_MORE is + // set on recv CQEs while the kernel will keep appending to the same buffer. + public const ushort IOU_PBUF_RING_INC = 2; + public const uint IORING_CQE_F_BUF_MORE = 1u << 4; + + // eventfd flags + poll mask (used for the cross-thread wake mechanism). + public const int EFD_CLOEXEC = 0x80000; + public const int EFD_NONBLOCK = 0x800; + public const uint POLLIN = 0x0001; + + // Setup flags. SINGLE_ISSUER tells the kernel only one thread will submit + // to this ring (skips locking on the SQ). DEFER_TASKRUN defers completion + // processing until io_uring_enter(GETEVENTS), which lets the kernel batch + // work and avoids interrupting the reactor with task_work mid-flight. 
+    public const uint IORING_SETUP_SINGLE_ISSUER = 1u << 12;
+    public const uint IORING_SETUP_DEFER_TASKRUN = 1u << 13;
+
+    public const int PROT_READ = 1;
+    public const int PROT_WRITE = 2;
+    public const int MAP_SHARED = 1;
+    public const int MAP_POPULATE = 0x8000;
+
+    public const int AF_INET = 2;
+    public const int SOCK_STREAM = 1;
+    public const int SOL_SOCKET = 1;
+    public const int SO_REUSEADDR = 2;
+    public const int SO_REUSEPORT = 15;
+    public const int IPPROTO_TCP = 6;
+    public const int TCP_NODELAY = 1;
+
+    // Raw entry points into libc's syscall(). On failure syscall() returns -1 and puts
+    // the error code in errno, so every overload captures errno (SetLastError = true)
+    // for the wrappers below to read.
+    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
+    private static extern long syscall3(long nr, uint a1, IoUringParams* a2);
+
+    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
+    private static extern long syscall6(long nr, uint a1, uint a2, uint a3, uint a4, void* a5, nuint a6);
+
+    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
+    private static extern long syscall4(long nr, uint a1, uint a2, void* a3, uint a4);
+
+    /// <summary>
+    /// Converts a libc syscall() result to the negative-errno convention: non-negative
+    /// results pass through unchanged, a failure becomes <c>-errno</c> (or -1 when no
+    /// errno was captured). Call it directly on the P/Invoke result so the captured
+    /// errno still belongs to that call.
+    /// </summary>
+    private static int ToNegativeErrno(long ret)
+    {
+        if (ret >= 0) return (int)ret;
+        int errno = Marshal.GetLastPInvokeError();
+        return errno > 0 ? -errno : -1;
+    }
+
+    /// <summary>io_uring_setup(2). Returns the ring fd, or a negative errno on failure.</summary>
+    public static int io_uring_setup(uint entries, IoUringParams* p) =>
+        ToNegativeErrno(syscall3(SYS_IO_URING_SETUP, entries, p));
+
+    /// <summary>
+    /// io_uring_enter(2). Returns the syscall's non-negative result, or a negative errno
+    /// on failure, so callers can compare against values such as <c>-EINTR</c>.
+    /// </summary>
+    public static int io_uring_enter(int fd, uint toSubmit, uint minComplete, uint flags) =>
+        ToNegativeErrno(syscall6(SYS_IO_URING_ENTER, (uint)fd, toSubmit, minComplete, flags, null, 0));
+
+    /// <summary>
+    /// io_uring_register(2). Returns the raw syscall() result (-1 on failure); the error
+    /// code is available through <c>Marshal.GetLastPInvokeError()</c>.
+    /// </summary>
+    public static int io_uring_register(int fd, uint opcode, void* arg, uint nrArgs) =>
+        (int)syscall4(SYS_IO_URING_REGISTER, (uint)fd, opcode, arg, nrArgs);
+
+    [DllImport("libc")] public static extern void* mmap(void* addr, nuint length, int prot, int flags, int fd, long offset);
+    [DllImport("libc")] public static extern int munmap(void* addr, nuint length);
+    [DllImport("libc")] public static extern int close(int fd);
+    [DllImport("libc")] public static extern int socket(int domain, int type, int proto);
+    [DllImport("libc")] public static extern int bind(int fd, sockaddr_in* addr, uint len);
+    [DllImport("libc")] public static extern int listen(int fd, int backlog);
+    [DllImport("libc")] public static extern int setsockopt(int fd, int 
level, int optname, void* optval, uint optlen); + [DllImport("libc")] public static extern int eventfd(uint initval, int flags); + [DllImport("libc")] public static extern long write(int fd, void* buf, nuint count); + [DllImport("libc")] public static extern long read(int fd, void* buf, nuint count); + // Inline send used by the handler thread for the response path — keeps the + // send off the io_uring reactor entirely (Twinflow-style). + [DllImport("libc", SetLastError = true)] public static extern long send(int fd, byte* buf, nuint len, int flags); + + public const int MSG_NOSIGNAL = 0x4000; + public const int EAGAIN = 11; + public const int EWOULDBLOCK = 11; + public const int EINTR = 4; + + public static ushort Htons(ushort x) => (ushort)((x << 8) | (x >> 8)); + + // Kernel struct layouts (must match include/uapi/linux/io_uring.h) + [StructLayout(LayoutKind.Sequential)] + public struct SqRingOffsets { + public uint head, tail, ring_mask, ring_entries, flags, dropped, array, resv1; + public ulong resv2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct CqRingOffsets { + public uint head, tail, ring_mask, ring_entries, overflow, cqes, flags, resv1; + public ulong resv2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct IoUringParams { + public uint sq_entries, cq_entries, flags, sq_thread_cpu, sq_thread_idle; + public uint features, wq_fd, resv0, resv1, resv2; + public SqRingOffsets sq_off; + public CqRingOffsets cq_off; + } + + [StructLayout(LayoutKind.Explicit, Size = 64)] + public struct IoUringSqe { + [FieldOffset(0)] public byte opcode; + [FieldOffset(1)] public byte flags; + [FieldOffset(2)] public ushort ioprio; + [FieldOffset(4)] public int fd; + [FieldOffset(8)] public ulong off; + [FieldOffset(16)] public ulong addr; + [FieldOffset(24)] public uint len; + [FieldOffset(28)] public uint op_flags; + [FieldOffset(32)] public ulong user_data; + [FieldOffset(40)] public ushort buf_index; + [FieldOffset(42)] public ushort personality; 
+ [FieldOffset(44)] public int splice_fd_in; + [FieldOffset(48)] public ulong addr3; + [FieldOffset(56)] public ulong __pad2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct IoUringCqe { + public ulong user_data; + public int res; + public uint flags; + } + + // Argument struct for IORING_REGISTER_PBUF_RING. + [StructLayout(LayoutKind.Sequential)] + public struct io_uring_buf_reg { + public ulong ring_addr; + public uint ring_entries; + public ushort bgid; + public ushort flags; + public ulong resv1, resv2, resv3; + } + + [StructLayout(LayoutKind.Sequential)] + public struct in_addr { public uint s_addr; } + + [StructLayout(LayoutKind.Sequential)] + public unsafe struct sockaddr_in { + public ushort sin_family; + public ushort sin_port; + public in_addr sin_addr; + public fixed byte sin_zero[8]; + } +} diff --git a/MinimaTFlow/io_uring/Ring.cs b/MinimaTFlow/io_uring/Ring.cs new file mode 100644 index 0000000..2d4837f --- /dev/null +++ b/MinimaTFlow/io_uring/Ring.cs @@ -0,0 +1,179 @@ +using System.Runtime.CompilerServices; +using static MinimaTFlow.Native; + +// ReSharper disable SuggestVarOrType_BuiltInTypes +// ReSharper disable SuggestVarOrType_Elsewhere +#pragma warning disable CA1806 + +namespace MinimaTFlow; + +public sealed unsafe class Ring : IDisposable +{ + private int _fd; + + public int Fd => _fd; + + private uint* _sqHead; + private uint* _sqTail; + private uint* _sqArray; + private uint _sqMask; + private uint _sqEntries; + private IoUringSqe* _sqes; + + private uint* _cqHead; + private uint* _cqTail; + private IoUringCqe* _cqes; + private uint _cqMask; + + private uint _sqeTail; + + private byte* _ringPtr; + private nuint _ringSize; + private byte* _sqePtr; + private nuint _sqeSize; + + public static Ring Create(uint entries) + { + IoUringParams ioUringParams = default; + ioUringParams.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN; + int fd = io_uring_setup(entries, &ioUringParams); + if (fd < 0) + { + throw new 
InvalidOperationException($"io_uring_setup failed: {fd}"); + } + + var ring = new Ring + { + _fd = fd, + _sqEntries = ioUringParams.sq_entries + }; + + nuint sqRingBytes = ioUringParams.sq_off.array + ioUringParams.sq_entries * sizeof(uint); + nuint cqRingBytes = ioUringParams.cq_off.cqes + ioUringParams.cq_entries * (nuint)sizeof(IoUringCqe); + nuint ringBytes = sqRingBytes > cqRingBytes ? sqRingBytes : cqRingBytes; + + void* ringMem = mmap(null, ringBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (ringMem == (void*)-1) + { + close(fd); + + throw new InvalidOperationException("mmap(SQ_RING) failed"); + } + ring._ringPtr = (byte*)ringMem; + ring._ringSize = ringBytes; + + nuint sqeBytes = ioUringParams.sq_entries * (nuint)sizeof(IoUringSqe); + void* sqeMem = mmap(null, sqeBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (sqeMem == (void*)-1) + { + munmap(ringMem, ringBytes); + close(fd); + + throw new InvalidOperationException("mmap(SQES) failed"); + } + ring._sqes = (IoUringSqe*)sqeMem; + ring._sqePtr = (byte*)sqeMem; + ring._sqeSize = sqeBytes; + + byte* ringPointer = (byte*)ringMem; + ring._sqHead = (uint*)(ringPointer + ioUringParams.sq_off.head); + ring._sqTail = (uint*)(ringPointer + ioUringParams.sq_off.tail); + ring._sqArray = (uint*)(ringPointer + ioUringParams.sq_off.array); + ring._sqMask = *(uint*)(ringPointer + ioUringParams.sq_off.ring_mask); + + ring._cqHead = (uint*)(ringPointer + ioUringParams.cq_off.head); + ring._cqTail = (uint*)(ringPointer + ioUringParams.cq_off.tail); + ring._cqes = (IoUringCqe*)(ringPointer + ioUringParams.cq_off.cqes); + ring._cqMask = *(uint*)(ringPointer + ioUringParams.cq_off.ring_mask); + + return ring; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public IoUringSqe* GetSqe() + { + uint head = Volatile.Read(ref *_sqHead); + + if (_sqeTail - head >= _sqEntries) + { + return null; + } + + uint slot = _sqeTail & _sqMask; + 
_sqArray[slot] = slot; + _sqeTail++; + + return &_sqes[slot]; + } + + public int SubmitAndWait(uint waitFor) + { + uint published = *_sqTail; + uint toSubmit = _sqeTail - published; + + if (toSubmit > 0) + { + Volatile.Write(ref *_sqTail, _sqeTail); + } + + if (toSubmit == 0 && waitFor == 0) return 0; + + uint flags = waitFor > 0 ? IORING_ENTER_GETEVENTS : 0; + + return io_uring_enter(_fd, toSubmit, waitFor, flags); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetCqe(out IoUringCqe cqe) + { + uint head = *_cqHead; + uint tail = Volatile.Read(ref *_cqTail); + + if (head == tail) + { + cqe = default; + + return false; + } + + cqe = _cqes[head & _cqMask]; + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqeSeen() => Volatile.Write(ref *_cqHead, *_cqHead + 1); + + // Batched CQ drain (liburing io_uring_for_each_cqe + io_uring_cq_advance): + // read the kernel-written tail once (acquire), process the whole batch, + // then publish the consumed head once (release) instead of once per CQE. 
+
+    /// <summary>Number of CQEs currently available: CQ tail minus the CQ head.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead;
+
+    /// <summary>
+    /// Returns a read-only reference to the CQE <paramref name="i"/> entries past the
+    /// current head. The head is not moved; the caller is expected to keep
+    /// <paramref name="i"/> below the value returned by <see cref="CqReady"/>.
+    /// </summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask];
+
+    /// <summary>Consumes <paramref name="n"/> CQEs by advancing the shared CQ head once.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n);
+
+    /// <summary>
+    /// Unmaps the ring and SQE mappings and closes the ring fd. Each resource is
+    /// nulled/zeroed after release, so a second call does nothing.
+    /// </summary>
+    public void Dispose()
+    {
+        if (_ringPtr != null)
+        {
+            munmap(_ringPtr, _ringSize); _ringPtr = null;
+        }
+
+        if (_sqePtr != null)
+        {
+            munmap(_sqePtr, _sqeSize); _sqePtr = null;
+        }
+
+        // NOTE(review): 0 doubles as the "no fd" sentinel here, so an fd of 0 is never closed.
+        if (_fd > 0)
+        {
+            close(_fd); _fd = 0;
+        }
+    }
+}
+
+#pragma warning restore CA1806
diff --git a/MinimaTPool/Connection/Connection.Incremental.cs b/MinimaTPool/Connection/Connection.Incremental.cs
new file mode 100644
index 0000000..d6aaa20
--- /dev/null
+++ b/MinimaTPool/Connection/Connection.Incremental.cs
@@ -0,0 +1,61 @@
+using System.Runtime.InteropServices;
+using MinimaTPool.Utils;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+/// <summary>
+/// Incremental-mode (IOU_PBUF_RING_INC) per-connection buffer-ring state.
+/// Each connection owns its own ring + slab; one buffer accumulates this
+/// connection's byte stream across many recvs. The reactor (Reactor.Incremental)
+/// drives setup/teardown and the refcounted recycle; this partial just holds the
+/// state and routes a handler return to the right reactor entry point.
+///
+/// All of these stay allocated across pool reuse and are freed in Dispose().
+/// </summary>
+public sealed unsafe partial class Connection
+{
+    internal byte* BufRing; // kernel-shared ring control area
+    internal byte* BufSlab; // this connection's recv slab
+    internal ushort Bgid; // buffer group id this ring was registered under
+    internal uint BufRingMask; // entries - 1, used to wrap the ring tail
+    internal int BufRingEntries;
+    internal bool IncrementalMode; // true while a per-connection ring is registered
+
+    internal int[]? CumOffset; // per-bid: byte offset where the next slice begins
+    internal int[]? RefCount; // per-bid: outstanding handler refs
+    internal bool[]? KernelDone; // per-bid: kernel finished appending (no F_BUF_MORE)
+
+    // Current pool generation; compared (truncated to 16 bits) against the
+    // generation carried by incremental buffer returns.
+    internal int Generation => Volatile.Read(ref _generation);
+
+    /// <summary>
+    /// Called by the handler to hand a consumed recv buffer back. Routes by mode:
+    /// incremental returns carry (fd, gen, bid) for refcounted recycle; the shared
+    /// path returns the bare bid to the reactor's single buf_ring.
+    /// </summary>
+    public void ReturnBuffer(in SpscRecvRing.Item item)
+    {
+        if (IncrementalMode)
+        {
+            _reactor.EnqueueReturnQIncremental(ClientFd, item.Gen, item.Bid);
+        }
+        else
+        {
+            _reactor.EnqueueReturnQ(item.Bid);
+        }
+    }
+
+    // Frees the native ring control area and recv slab, if allocated; pointers are
+    // nulled so a repeated call is a no-op. The managed tracking arrays are left to the GC.
+    private void DisposeIncremental()
+    {
+        if (BufRing != null)
+        {
+            NativeMemory.AlignedFree(BufRing);
+            BufRing = null;
+        }
+        if (BufSlab != null)
+        {
+            NativeMemory.AlignedFree(BufSlab);
+            BufSlab = null;
+        }
+    }
+}
diff --git a/MinimaTPool/Connection/Connection.Read.cs b/MinimaTPool/Connection/Connection.Read.cs
new file mode 100644
index 0000000..bf2aced
--- /dev/null
+++ b/MinimaTPool/Connection/Connection.Read.cs
@@ -0,0 +1,168 @@
+using System.Threading.Tasks.Sources;
+using MinimaTPool.Utils;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+///
+/// Per-connection state. The handler may run on any thread (e.g. resumed by
+/// a thread-pool timer); reactor-only side effects are funnelled through the
+/// MPSC queues on `Reactor`. Coordination uses Interlocked.Exchange on the
+/// arm flags and a sticky `_pending` to close the lost-wakeup race.
+///
+/// Lifetime is pool-managed: the reactor pops a Connection on accept (or new
+/// one if pool is empty), and pushes it back on teardown after `Clear()`. The
+/// `_generation` field is bumped on each `Clear` so stale `ValueTask` tokens
+/// from a previous connection life are detectable and return `Closed()`
+/// instead of leaking the new tenant's state.
+///
+public sealed unsafe partial class Connection : IValueTaskSource
+{
+    // Rebinds this (possibly pooled) connection to a new client fd; returns this for chaining.
+    internal Connection SetFd(int fd)
+    {
+        ClientFd = fd;
+        return this;
+    }
+
+    private ManualResetValueTaskSourceCore _readSignal = new()
+    {
+        // Always resume the handler on the thread pool — never inline on the reactor.
+        // This is the key knob that distinguishes MinimaTPool from Minima.
+        RunContinuationsAsynchronously = true,
+    };
+    private int _armed;   // 1 while a ReadAsync awaiter is waiting on _readSignal
+    private int _pending; // sticky "something happened while nobody was armed" flag
+    private int _closed;  // 1 once the connection is closed (MarkClosed or recv-queue overflow)
+
+    private readonly SpscRecvRing _recv = new(capacityPow2: 16);
+
+    /// <summary>
+    /// Waits for recv activity. Completes synchronously with a snapshot when items
+    /// are already queued, a wake-up is pending, or the connection is closed;
+    /// otherwise arms the read signal and returns a task backed by this object,
+    /// using the low 16 bits of the current generation as its token.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">A previous ReadAsync is still armed.</exception>
+    public ValueTask ReadAsync()
+    {
+        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1)
+        {
+            Volatile.Write(ref _pending, 0);
+            return new ValueTask(
+                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+
+        if (Volatile.Read(ref _closed) != 0)
+        {
+            return new ValueTask(RecvSnapshot.Closed());
+        }
+
+        if (Interlocked.Exchange(ref _armed, 1) == 1)
+        {
+            throw new InvalidOperationException("ReadAsync already armed.");
+        }
+
+        // Snapshot the generation as the IVTS token so a future Clear() can
+        // invalidate this awaiter if the connection gets pool-recycled.
+        int gen = Volatile.Read(ref _generation);
+
+        // Race recovery: re-check between arming and returning the IVTS task.
+        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0)
+        {
+            Volatile.Write(ref _pending, 0);
+            Interlocked.Exchange(ref _armed, 0);
+
+            return new ValueTask(
+                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+
+        return new ValueTask(this, (short)gen);
+    }
+
+    /// <summary>
+    /// Dequeues the next recv item that was queued no later than the tail captured in
+    /// <paramref name="snap"/>; returns <c>false</c> once the snapshot is exhausted.
+    /// </summary>
+    public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item)
+        => _recv.TryDequeueUntil(snap.Tail, out item);
+
+    /// <summary>Resets the read signal so it can be awaited again.</summary>
+    public void ResetRead() => _readSignal.Reset();
+
+    /// <summary>
+    /// Records one recv completion: enqueues an item stamped with the current
+    /// generation, then either completes the armed reader or sets the sticky
+    /// pending flag. If the queue is full, the overflow is logged, the buffer (if
+    /// any) is returned directly to the reactor and the connection is marked closed.
+    /// </summary>
+    /// <param name="res">Stored as the item's <c>Len</c>.</param>
+    /// <param name="bid">Buffer id carried by the item.</param>
+    /// <param name="hasBuffer">Whether <paramref name="bid"/>/<paramref name="ptr"/> refer to a real buffer.</param>
+    /// <param name="ptr">Start of the received bytes.</param>
+    public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr)
+    {
+        if (!_recv.TryEnqueue(new SpscRecvRing.Item
+        {
+            Ptr = ptr,
+            Bid = bid,
+            Len = res,
+            HasBuffer = hasBuffer,
+            Gen = (ushort)Volatile.Read(ref _generation)
+        }))
+        {
+            Console.Error.WriteLine("[conn] recv queue overflow.");
+            if (hasBuffer)
+            {
+                _reactor.ReturnBufferDirect(bid);
+            }
+            Volatile.Write(ref _closed, 1);
+        }
+
+        // Wake the reader if one is armed; otherwise leave a sticky marker that the
+        // next ReadAsync consumes on its fast path.
+        if (Interlocked.Exchange(ref _armed, 0) == 1)
+        {
+            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
+        }
+        else
+        {
+            Volatile.Write(ref _pending, 1);
+        }
+    }
+
+    internal void DrainRecv()
+    {
+        // Return any buffer IDs still sitting in the SPSC ring (handler exited
+        // before draining them, or a recv arrived after _closed was set).
+        while (_recv.TryDequeue(out SpscRecvRing.Item item))
+        {
+            if (item.HasBuffer)
+            {
+                _reactor.ReturnBufferDirect(item.Bid);
+            }
+        }
+    }
+
+    // =========================================================================
+    // IValueTaskSource plumbing — token (= snapshot of `_generation` at await
+    // time) is compared against the current `_generation` to detect stale
+    // awaiters from before a Clear()/pool reuse. Stale awaiters get a
+    // sentinel result rather than the new tenant's state.
+    //
+    // For the actual IVTS dispatch we pass `_readSignal.Version` /
+    // `_flushSignal.Version` to the underlying core (not `token`) because the
+    // core's version is bumped mid-life by every Reset() (ResetRead for reads,
+    // FlushAsync for flushes) and is unrelated to the cross-life generation guard.
+    // =========================================================================
+
+    // Stale token (connection was recycled since the await began): report Closed().
+    RecvSnapshot IValueTaskSource.GetResult(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return RecvSnapshot.Closed();
+        }
+
+        return _readSignal.GetResult(_readSignal.Version);
+    }
+
+    // Stale token: report Succeeded so the awaiter proceeds straight to GetResult.
+    ValueTaskSourceStatus IValueTaskSource.GetStatus(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return ValueTaskSourceStatus.Succeeded;
+        }
+
+        return _readSignal.GetStatus(_readSignal.Version);
+    }
+
+    void IValueTaskSource.OnCompleted(Action continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            // Stale — run the continuation now so the awaiter unblocks and
+            // gets RecvSnapshot.Closed() from GetResult.
+            // Note: this runs inline on the calling thread, not via the thread pool.
+            continuation(state);
+
+            return;
+        }
+
+        _readSignal.OnCompleted(continuation, state, _readSignal.Version, flags);
+    }
+}
diff --git a/MinimaTPool/Connection/Connection.Write.cs b/MinimaTPool/Connection/Connection.Write.cs
new file mode 100644
index 0000000..79eb82d
--- /dev/null
+++ b/MinimaTPool/Connection/Connection.Write.cs
@@ -0,0 +1,187 @@
+using System.Buffers;
+using System.Threading.Tasks.Sources;
+using MinimaTPool.Utils;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+// Write side of the connection: a fixed-size native slab that the handler fills
+// (IBufferWriter-style or via Write) and hands to the reactor with FlushAsync.
+public sealed unsafe partial class Connection : IValueTaskSource, IBufferWriter
+{
+    private readonly int _writeSlabSize; // capacity of WriteBuffer in bytes
+    internal byte* WriteBuffer;          // native slab allocated in the constructor
+    internal int WriteHead;
+    internal int WriteTail;              // number of bytes written into the slab so far
+    internal int WriteInFlight;          // WriteTail captured by the flush currently handed to the reactor
+
+    private readonly UnmanagedMemoryManager _manager;
+
+    private ManualResetValueTaskSourceCore _flushSignal = new()
+    {
+        // Always resume the handler on the thread pool — never inline on the reactor.
+        // This is the key knob that distinguishes MinimaTPool from Minima.
+        RunContinuationsAsynchronously = true,
+    };
+    private int _flushArmed;      // 1 while a flush awaiter is waiting on _flushSignal
+    private int _flushInProgress; // 1 from FlushAsync until the flush completes; blocks writes meanwhile
+
+    // IBufferWrite
+#region IBufferWrite
+
+    /// <summary>
+    /// Returns the unused remainder of the write slab as memory.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">
+    /// A flush is in progress, or <paramref name="sizeHint"/> exceeds the remaining space
+    /// (the slab never grows).
+    /// </exception>
+    public Memory GetMemory(int sizeHint = 0)
+    {
+        if (Volatile.Read(ref _flushInProgress) != 0)
+        {
+            throw new InvalidOperationException("Cannot write while flush is in progress.");
+        }
+
+        int remaining = _writeSlabSize - WriteTail;
+        if (sizeHint > remaining)
+        {
+            throw new InvalidOperationException("Buffer too small.");
+        }
+
+        return _manager.Memory.Slice(WriteTail, remaining);
+    }
+
+    /// <summary>
+    /// Returns the unused remainder of the write slab as a span over native memory.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">
+    /// A flush is in progress, or <paramref name="sizeHint"/> exceeds the remaining space.
+    /// </exception>
+    public Span GetSpan(int sizeHint = 0)
+    {
+        if (Volatile.Read(ref _flushInProgress) != 0)
+        {
+            throw new InvalidOperationException("Cannot write while flush is in progress.");
+        }
+
+        if (WriteTail + sizeHint > _writeSlabSize)
+        {
+            throw new InvalidOperationException("Write buffer too small.");
+        }
+
+        return new Span(WriteBuffer + WriteTail, _writeSlabSize - WriteTail);
+    }
+
+    /// <summary>
+    /// Commits <paramref name="count"/> bytes previously written via GetMemory/GetSpan.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">A flush is in progress.</exception>
+    public void Advance(int count)
+    {
+        if (Volatile.Read(ref _flushInProgress) != 0)
+        {
+            throw new InvalidOperationException("Cannot write while flush is in progress.");
+        }
+
+        // NOTE(review): count is not validated against the remaining slab space here.
+        WriteTail += count;
+    }
+
+#endregion
+
+    // Write to the inner buffer
+    // Copies source to the end of the slab and advances WriteTail; throws
+    // InvalidOperationException if a flush is in progress or source does not fit.
+    public void Write(ReadOnlySpan source)
+    {
+        if (Volatile.Read(ref _flushInProgress) != 0)
+        {
+            throw new InvalidOperationException("Cannot write while flush is in progress.");
+        }
+
+        int len = source.Length;
+        if (WriteTail + len > _writeSlabSize)
+        {
+            throw new InvalidOperationException("Write buffer too small.");
+        }
+
+        source.CopyTo(new Span(WriteBuffer + WriteTail, len));
+        WriteTail += len;
+    }
+
+    // Flush inner buffer data to the kernel
+    // Returns a completed task when the connection is closed or nothing is buffered;
+    // otherwise hands the fd to the reactor (EnqueueFlush) and returns a task that
+    // completes from CompleteFlush/MarkClosed. Throws InvalidOperationException if
+    // another flush is already in progress or armed.
+    public ValueTask FlushAsync()
+    {
+        // Connection already torn down (reactor saw EOF/error → MarkClosed): don't flush
+        // a removed connection — the handoff would reach a reactor that no longer knows
+        // this fd and the awaiter would hang. Return completed so the handler unwinds to
+        // its next ReadAsync, sees IsClosed, and exits.
+        if (Volatile.Read(ref _closed) == 1)
+        {
+            return default;
+        }
+
+        if (Interlocked.Exchange(ref _flushInProgress, 1) == 1)
+        {
+            throw new InvalidOperationException("FlushAsync already in progress.");
+        }
+
+        int target = WriteTail;
+        if (target == 0)
+        {
+            Volatile.Write(ref _flushInProgress, 0);
+
+            return default;
+        }
+
+        // NOTE(review): if this throws, _flushInProgress stays at 1 — confirm that is intended.
+        if (Interlocked.Exchange(ref _flushArmed, 1) == 1)
+        {
+            throw new InvalidOperationException("FlushAsync already armed.");
+        }
+
+        _flushSignal.Reset();
+        WriteInFlight = target;
+
+        int gen = Volatile.Read(ref _generation);
+
+        _reactor.EnqueueFlush(ClientFd);
+
+        // Race recovery (mirrors ReadAsync): if close raced in after the guard above,
+        // self-complete so we don't hang waiting on a send the reactor will never make.
+        if (Volatile.Read(ref _closed) == 1 && Interlocked.Exchange(ref _flushArmed, 0) == 1)
+        {
+            Volatile.Write(ref _flushInProgress, 0);
+            _flushSignal.SetResult(true);
+        }
+
+        return new ValueTask(this, (short)gen);
+    }
+
+    // Signal the FlushAsync was completed, called by the reactor's dispatcher send branch
+    // Empties the slab (head/tail/in-flight back to 0), clears both flush flags and
+    // completes the flush signal.
+    internal void CompleteFlush()
+    {
+        WriteHead = 0;
+        WriteTail = 0;
+        WriteInFlight = 0;
+        Volatile.Write(ref _flushInProgress, 0);
+        Interlocked.Exchange(ref _flushArmed, 0);
+
+        _flushSignal.SetResult(true);
+    }
+
+    // IValueTaskSource
+    // Same stale-token rule as the read side: a token that no longer matches the low
+    // 16 bits of _generation is treated as already completed.
+#region IValueTaskSource
+
+    void IValueTaskSource.GetResult(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return;
+        }
+
+        _flushSignal.GetResult(_flushSignal.Version);
+    }
+
+    ValueTaskSourceStatus IValueTaskSource.GetStatus(short token)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            return ValueTaskSourceStatus.Succeeded;
+        }
+
+        return _flushSignal.GetStatus(_flushSignal.Version);
+    }
+
+    void IValueTaskSource.OnCompleted(Action continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
+    {
+        if (token != (short)Volatile.Read(ref _generation))
+        {
+            // Stale awaiter: run the continuation immediately on the calling thread.
+            continuation(state);
+
+            return;
+        }
+        _flushSignal.OnCompleted(continuation, state, _flushSignal.Version, flags);
+    }
+
+#endregion
+}
\ No newline at end of file
diff --git a/MinimaTPool/Connection/Connection.cs b/MinimaTPool/Connection/Connection.cs
new file mode 100644
index 0000000..16eb291
--- /dev/null
+++ b/MinimaTPool/Connection/Connection.cs
@@ -0,0 +1,108 @@
+using System.Runtime.InteropServices;
+using MinimaTPool.Utils;
+
+namespace MinimaTPool;
+
+public sealed unsafe partial class Connection
+{
+    private readonly Reactor _reactor;
+
+    public int ClientFd { get; private set; }
+
+    // Bumped on Clear(); the low 16 bits are used as the IVTS token so stale
+    // awaiters can be detected after pool reuse.
+    private int _generation;
+
+    // Refcount: the connection has two owners — the reactor (recv side) and the
+    // handler (which may run off-reactor). Init to 2 on accept; each owner DecRef's
+    // when done; teardown (Recycle) runs only at refs==0, so a connection is never
+    // recycled or pool-reused while a handler is still in flight on another thread.
+    private int _refs;
+
+    // Binds the connection to its reactor and fd and allocates the native write slab
+    // (64-byte aligned, writeSlabSize bytes, 16 KiB by default); the slab is freed in Dispose().
+    public Connection(Reactor reactor, int fd, int writeSlabSize = 1024 * 16)
+    {
+        _reactor = reactor;
+        ClientFd = fd;
+        _writeSlabSize = writeSlabSize;
+        WriteBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)writeSlabSize, 64);
+
+        _manager = new UnmanagedMemoryManager(WriteBuffer, writeSlabSize);
+    }
+
+    // =========================================================================
+    // Pool lifecycle — invoked from Reactor.Dispatch's recv/send error paths.
+    // Reactor-thread only.
+    //
+    //   teardown: MarkClosed() → wake awaiters with closed=1
+    //             DrainRecv() → return any in-flight buf_ring items
+    //             close(fd)
+    //             Clear() → reset state, bump _generation
+    //             push to pool, OR Dispose() if pool is full
+    // =========================================================================
+
+    // Marks the connection closed and releases anyone waiting on it: an armed reader
+    // gets a closed snapshot (otherwise the sticky pending flag is set so the next
+    // ReadAsync returns immediately), and an armed flush is completed.
+    public void MarkClosed()
+    {
+        Volatile.Write(ref _closed, 1);
+
+        if (Interlocked.Exchange(ref _armed, 0) == 1)
+        {
+            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true));
+        }
+        else
+        {
+            Volatile.Write(ref _pending, 1);
+        }
+
+        if (Interlocked.Exchange(ref _flushArmed, 0) == 1)
+        {
+            Volatile.Write(ref _flushInProgress, 0);
+            _flushSignal.SetResult(true);
+        }
+    }
+
+    // Init to 2 (reactor + handler) at accept.
+    internal void InitRefs() => Volatile.Write(ref _refs, 2);
+
+    // Release one owner's ref. Whoever drives it to 0 hands the connection to the
+    // reactor for teardown (close + Clear + pool) — never recycled before both done.
+    internal void DecRef()
+    {
+        if (Interlocked.Decrement(ref _refs) == 0)
+        {
+            _reactor.EnqueueRecycle(this);
+        }
+    }
+
+    // Resets all per-life state so the object can be reused for another client:
+    // new generation, flags cleared, write slab emptied, both signals and the recv
+    // queue reset. Native buffers are kept.
+    internal void Clear()
+    {
+        // Bump generation first — readers of IVTS plumbing observe this via
+        // Volatile.Read and stale tokens get RecvSnapshot.Closed() / no-op.
+        Interlocked.Increment(ref _generation);
+
+        Volatile.Write(ref _armed, 0);
+        Volatile.Write(ref _pending, 0);
+        Volatile.Write(ref _closed, 0);
+        Volatile.Write(ref _flushArmed, 0);
+        Volatile.Write(ref _flushInProgress, 0);
+
+        WriteHead = 0;
+        WriteTail = 0;
+        WriteInFlight = 0;
+
+        _readSignal.Reset();
+        _flushSignal.Reset();
+
+        _recv.Reset(); // discard any leftover SPSC items
+        IncrementalMode = false; // per-conn ring (if any) was torn down before Clear
+    }
+
+    // Frees the native write slab and the incremental ring/slab; pointers are nulled
+    // after freeing, so calling this twice is harmless.
+    public void Dispose()
+    {
+        if (WriteBuffer != null)
+        {
+            NativeMemory.AlignedFree(WriteBuffer);
+            WriteBuffer = null;
+        }
+        DisposeIncremental();
+    }
+}
\ No newline at end of file
diff --git a/MinimaTPool/Connection/ConnectionDualPipe.cs b/MinimaTPool/Connection/ConnectionDualPipe.cs
new file mode 100644
index 0000000..1bd16fd
--- /dev/null
+++ b/MinimaTPool/Connection/ConnectionDualPipe.cs
@@ -0,0 +1,16 @@
+using System.IO.Pipelines;
+
+namespace MinimaTPool;
+
+// Bundles a ConnectionPipeReader and a ConnectionPipeWriter over the same
+// Connection behind the standard IDuplexPipe interface.
+public sealed class ConnectionDualPipe : IDuplexPipe
+{
+    public PipeReader Input { get; }
+    public PipeWriter Output { get; }
+
+    public ConnectionDualPipe(Connection connection)
+    {
+        ArgumentNullException.ThrowIfNull(connection);
+        Input = new ConnectionPipeReader(connection);
+        Output = new ConnectionPipeWriter(connection);
+    }
+}
\ No newline at end of file
diff --git a/MinimaTPool/Connection/ConnectionPipeReader.cs b/MinimaTPool/Connection/ConnectionPipeReader.cs
new file mode 100644
index 0000000..8bdce9c
--- /dev/null
+++ b/MinimaTPool/Connection/ConnectionPipeReader.cs
@@ -0,0 +1,181 @@
+using System.Buffers;
+using System.IO.Pipelines;
+using MinimaTPool.Utils;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+///
+/// Adapts Minima's raw read API (ReadAsync + TryGetItem
+/// + ReturnBuffer) to a standard <see cref="PipeReader"/>.
Recv buffers are
+/// exposed zero-copy as a ReadOnlySequence&lt;byte&gt; (one segment per buffer)
+/// and held until AdvanceTo consumes them, at which point fully-consumed buffers
+/// are returned to the reactor.
+///
+/// Convenience/compat layer for PipeReader consumers — the raw ReadAsync/
+/// TryGetItem path stays the faster one (this adds held-buffer + sequence
+/// bookkeeping per read).
+///
+public sealed class ConnectionPipeReader : PipeReader
+{
+    private readonly Connection _conn;
+    private readonly List _held = new(16);  // recv buffers handed out but not yet fully consumed, oldest first
+    private ReadOnlySequence _lastSequence; // sequence returned by the most recent read; AdvanceTo measures against it
+
+    private bool _completed;
+    private bool _cancelRequested;
+    private bool _connectionClosed;
+
+    // One held recv buffer: the still-unconsumed memory plus the ring item needed to return it.
+    private readonly struct Held
+    {
+        public readonly ReadOnlyMemory Memory;
+        public readonly SpscRecvRing.Item Item;
+
+        public Held(ReadOnlyMemory memory, SpscRecvRing.Item item)
+        {
+            Memory = memory;
+            Item = item;
+        }
+
+        public Held WithMemory(ReadOnlyMemory memory) => new(memory, Item);
+    }
+
+    public ConnectionPipeReader(Connection connection)
+    {
+        _conn = connection ?? throw new ArgumentNullException(nameof(connection));
+    }
+
+    // Returns held data immediately if there is any; otherwise awaits the connection,
+    // collects every buffer-carrying item of the snapshot and exposes them as one sequence.
+    // Items without a buffer are dequeued and dropped.
+    // NOTE(review): cancellationToken is never observed, and CancelPendingRead only sets
+    // a flag — it does not wake a read that is already awaiting the connection.
+    public override async ValueTask ReadAsync(CancellationToken cancellationToken = default)
+    {
+        ThrowIfCompleted();
+
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed);
+        }
+
+        // Anything still held from a previous read that wasn't fully consumed.
+        if (_held.Count > 0)
+            return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
+
+        if (_connectionClosed)
+            return new ReadResult(default, isCanceled: false, isCompleted: true);
+
+        RecvSnapshot snap = await _conn.ReadAsync();
+
+        while (_conn.TryGetItem(snap, out SpscRecvRing.Item item))
+        {
+            if (item.HasBuffer)
+                _held.Add(new Held(item.AsMemoryManager().Memory, item));
+        }
+
+        _conn.ResetRead();
+
+        if (snap.IsClosed)
+            _connectionClosed = true;
+
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed);
+        }
+
+        return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
+    }
+
+    // Synchronous variant: succeeds only when data is already held or the connection
+    // is known to be closed; it never pulls new items from the connection.
+    public override bool TryRead(out ReadResult result)
+    {
+        ThrowIfCompleted();
+
+        if (_held.Count > 0)
+        {
+            result = new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed);
+            return true;
+        }
+
+        if (_connectionClosed)
+        {
+            result = new ReadResult(default, isCanceled: false, isCompleted: true);
+            return true;
+        }
+
+        result = default;
+        return false;
+    }
+
+    public override void AdvanceTo(SequencePosition consumed) => AdvanceTo(consumed, consumed);
+
+    // Releases consumed bytes from the front of the held list: fully consumed buffers
+    // go back to the reactor, a partially consumed one is trimmed in place.
+    // NOTE(review): `examined` is ignored. A caller that consumes nothing but examines
+    // everything will get the same held data back from the next read immediately
+    // instead of waiting for more bytes — confirm this is acceptable for consumers.
+    public override void AdvanceTo(SequencePosition consumed, SequencePosition examined)
+    {
+        if (_held.Count == 0)
+            return;
+
+        long consumedBytes = _lastSequence.Slice(0, consumed).Length;
+
+        while (_held.Count > 0 && consumedBytes > 0)
+        {
+            Held seg = _held[0];
+            int available = seg.Memory.Length;
+
+            if (consumedBytes >= available)
+            {
+                // Whole buffer consumed — return it to the reactor.
+                _conn.ReturnBuffer(seg.Item);
+                _held.RemoveAt(0);
+                consumedBytes -= available;
+            }
+            else
+            {
+                // Partial — keep the unconsumed tail of this buffer.
+                _held[0] = seg.WithMemory(seg.Memory[(int)consumedBytes..]);
+                consumedBytes = 0;
+            }
+        }
+    }
+
+    public override void CancelPendingRead() => _cancelRequested = true;
+
+    // Marks the reader completed (idempotent) and returns every still-held buffer to
+    // the reactor. The exception argument is not used.
+    public override void Complete(Exception? exception = null)
+    {
+        if (_completed)
+            return;
+
+        _completed = true;
+
+        for (int i = 0; i < _held.Count; i++)
+            _conn.ReturnBuffer(_held[i].Item);
+
+        _held.Clear();
+    }
+
+    // Builds (and remembers in _lastSequence) the sequence over all held buffers:
+    // empty, single-segment, or a RingSegment chain with one segment per buffer.
+    private ReadOnlySequence BuildSequence()
+    {
+        if (_held.Count == 0)
+        {
+            _lastSequence = default;
+            return _lastSequence;
+        }
+
+        if (_held.Count == 1)
+        {
+            _lastSequence = new ReadOnlySequence(_held[0].Memory);
+            return _lastSequence;
+        }
+
+        var head = new RingSegment(_held[0].Memory, _held[0].Item.Bid);
+        RingSegment tail = head;
+
+        for (int i = 1; i < _held.Count; i++)
+            tail = tail.Append(_held[i].Memory, _held[i].Item.Bid);
+
+        _lastSequence = new ReadOnlySequence(head, 0, tail, tail.Memory.Length);
+        return _lastSequence;
+    }
+
+    private void ThrowIfCompleted()
+    {
+        if (_completed)
+            throw new InvalidOperationException("Reading is not allowed after the reader was completed.");
+    }
+}
diff --git a/MinimaTPool/Connection/ConnectionPipeWriter.cs b/MinimaTPool/Connection/ConnectionPipeWriter.cs
new file mode 100644
index 0000000..1598e2e
--- /dev/null
+++ b/MinimaTPool/Connection/ConnectionPipeWriter.cs
@@ -0,0 +1,63 @@
+using System.IO.Pipelines;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+/// <summary>
+/// Adapts Minima's write API (GetMemory/GetSpan/Advance/
+/// FlushAsync) to a standard <see cref="PipeWriter"/>, so PipeWriter-based code
+/// can write responses through the connection's per-connection slab.
+/// A thin wrapper — all the work lives in Connection.
+/// </summary>
+public sealed class ConnectionPipeWriter : PipeWriter
+{
+    private readonly Connection _conn;
+    private bool _completed;
+    private bool _cancelRequested;
+    private long _unflushed; // bytes advanced since the last FlushAsync
+
+    public ConnectionPipeWriter(Connection connection)
+    {
+        _conn = connection ??
throw new ArgumentNullException(nameof(connection));
+    }
+
+    public override bool CanGetUnflushedBytes => true;
+    public override long UnflushedBytes => _unflushed;
+
+    public override Memory GetMemory(int sizeHint = 0) => _conn.GetMemory(sizeHint);
+
+    public override Span GetSpan(int sizeHint = 0) => _conn.GetSpan(sizeHint);
+
+    public override void Advance(int bytes)
+    {
+        _unflushed += bytes;
+        _conn.Advance(bytes);
+    }
+
+    // Flushes through the connection. A pending CancelPendingFlush request is consumed
+    // and reported as a canceled result without flushing. When the connection's flush
+    // completes synchronously no async state machine is allocated.
+    // NOTE(review): cancellationToken is not observed.
+    public override ValueTask FlushAsync(CancellationToken cancellationToken = default)
+    {
+        if (_cancelRequested)
+        {
+            _cancelRequested = false;
+            return new ValueTask(new FlushResult(isCanceled: true, isCompleted: _completed));
+        }
+
+        _unflushed = 0;
+        ValueTask inner = _conn.FlushAsync();
+
+        if (inner.IsCompletedSuccessfully)
+            return new ValueTask(new FlushResult(isCanceled: false, isCompleted: _completed));
+
+        return AwaitFlush(inner);
+    }
+
+    // Slow path of FlushAsync: awaits the connection's flush and wraps the outcome.
+    private async ValueTask AwaitFlush(ValueTask inner)
+    {
+        await inner;
+        return new FlushResult(isCanceled: false, isCompleted: _completed);
+    }
+
+    public override void CancelPendingFlush() => _cancelRequested = true;
+
+    // Only records completion; the exception argument is not used and nothing is flushed.
+    public override void Complete(Exception? exception = null) => _completed = true;
+}
diff --git a/MinimaTPool/Connection/RecvSnapshot.cs b/MinimaTPool/Connection/RecvSnapshot.cs
new file mode 100644
index 0000000..58cf47c
--- /dev/null
+++ b/MinimaTPool/Connection/RecvSnapshot.cs
@@ -0,0 +1,15 @@
+namespace MinimaTPool;
+
+// Result of Connection.ReadAsync: the recv-queue tail captured at wake-up time
+// (items up to it can be dequeued with TryGetItem) and whether the connection is closed.
+public readonly struct RecvSnapshot
+{
+    public readonly long Tail;
+    public readonly bool IsClosed;
+
+    public RecvSnapshot(long tail, bool isClosed)
+    {
+        Tail = tail;
+        IsClosed = isClosed;
+    }
+
+    // Sentinel for a closed connection: tail 0, IsClosed = true.
+    public static RecvSnapshot Closed() => new(0, isClosed: true);
+}
\ No newline at end of file
diff --git a/MinimaTPool/MinimaTPool.csproj b/MinimaTPool/MinimaTPool.csproj
new file mode 100644
index 0000000..1ec9d30
--- /dev/null
+++ b/MinimaTPool/MinimaTPool.csproj
@@ -0,0 +1,12 @@
+
+
+
+    Exe
+    net10.0
+    enable
+    enable
+    true
+    MinimaTPool
+
+
+
diff --git a/MinimaTPool/Program.cs b/MinimaTPool/Program.cs
new file mode 100644
index 0000000..5b44b1a
--- /dev/null
+++ b/MinimaTPool/Program.cs
@@ -0,0 +1,178 @@
+using System.Buffers;
+using System.IO.Pipelines;
+using System.Text.Json;
+using MinimaTPool.Utils;
+
+namespace MinimaTPool;
+
+/// <summary>
+/// Multi-reactor HTTP/1.1 server using io_uring directly. Spawns one reactor
+/// thread per configured reactor (ServerConfig.ReactorCount); each opens its own
+/// SO_REUSEPORT listener, runs its own io_uring, owns its own connection map.
+/// The kernel load-balances new connections across reactors. In this thread-pool
+/// variant handlers resume off the reactor thread, so per-connection state is
+/// shared between the reactor and the handler and is coordinated through the
+/// Interlocked/Volatile flags on Connection and the reactor's queues.
+/// </summary>
+internal static unsafe class Program
+{
+    // Fixed response sent for every recv burst.
+    internal static ReadOnlySpan Response =>
+        "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8;
+
+    private static int Main()
+    {
+        // All tunables live in ServerConfig — override the defaults here.
+        var config = new ServerConfig()
+        {
+            UsePipe = false,
+            ReactorCount = 10
+        };
+
+        Console.WriteLine($"[Minima] starting {config.ReactorCount} reactors on port {config.Port} (incremental={config.Incremental})");
+
+        // One foreground thread per reactor; Main blocks until every reactor loop exits.
+        var threads = new Thread[config.ReactorCount];
+        for (var i = 0; i < config.ReactorCount; i++)
+        {
+            var reactor = new Reactor(i, config);
+
+            threads[i] = new Thread(reactor.Run)
+            {
+                Name = $"reactor-{i}",
+                IsBackground = false
+            };
+            threads[i].Start();
+        }
+
+        foreach (var t in threads)
+        {
+            t.Join();
+        }
+
+        return 0;
+    }
+}
+
+internal static class Handler
+{
+    // Real async-work knob: serialize an in-memory object of WORK_ITEMS elements to JSON
+    // on the THREAD POOL (via Task.Run) per request. 0 / unset = disabled (pure inline
+    // reactor path). Genuine CPU + allocation, not a busy-spin.
+    // NOTE(review): the value is hard-coded to 1000 and the code that used it in
+    // HandleAsync is commented out, so this knob currently only sizes LargeObject.
+    private static readonly int WorkItems = 1000;
+
+    private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1));
+
+    // Builds a synthetic payload with `count` items and a UTC timestamp.
+    private static Payload BuildPayload(int count)
+    {
+        var items = new Item[count];
+        for (int i = 0; i < count; i++)
+        {
+            items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}");
+        }
+        return new Payload(DateTime.UtcNow.ToString("O"), count, items);
+    }
+
+    // Raw-API handler loop: await recv activity, return every received buffer, do a
+    // small piece of thread-pool work, write the fixed response and flush; exits when
+    // the snapshot reports the connection closed. Always releases the handler's
+    // connection ref on the way out.
+    public static async Task HandleAsync(Reactor reactor, Connection conn)
+    {
+        try
+        {
+            while (true)
+            {
+                RecvSnapshot snap = await conn.ReadAsync();
+
+                while (conn.TryGetItem(snap, out SpscRecvRing.Item item))
+                {
+                    if (item.HasBuffer)
+                    {
+                        UnmanagedMemoryManager mem = item.AsMemoryManager();
+                        ReadOnlyMemory data = mem.Memory;
+                        // data is now usable with any BCL Memory/async API
+                        _ = data.Length;
+
+                        // Cross-thread safe and mode-agnostic: routes to the
+                        // shared-ring return or the incremental refcounted return.
+                        conn.ReturnBuffer(in item);
+                    }
+                }
+
+                // Forces a hop to the thread pool with a trivial serialization.
+                _ = await Task.Run(static () => JsonSerializer.Serialize("Hello World!"));
+
+                // Real async work: serialize a large object to JSON on the THREAD POOL.
+                // The handler resumes OFF-REACTOR, so the FlushAsync below pays the eventfd
+                // handoff the pure-inline path avoids — and the serialization is genuine
+                // CPU + GC pressure on the pool, not a busy-spin.
+                /*if (WorkItems > 0)
+                {
+                    _ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject));
+                    //JsonSerializer.SerializeToUtf8Bytes(LargeObject);
+                }*/
+
+                // One response per recv burst — accumulate in the connection's
+                // per-connection write slab, then submit and await ack.
+                conn.Write(Program.Response);
+                await conn.FlushAsync();
+
+                if (snap.IsClosed)
+                {
+                    // Reactor already owns teardown (Connections.Remove + close
+                    // happens in Dispatch's recv-error branch); we just exit.
+                    return;
+                }
+
+                conn.ResetRead();
+            }
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine($"[r{reactor.Id}] handler crash on fd={conn.ClientFd}: {ex}");
+            // Reactor will clean the connection up via the recv-error path
+            // (or SPSC overflow) on the next CQE for this fd.
+        }
+        finally
+        {
+            conn.DecRef(); // release the handler's ref; teardown runs once the reactor releases too
+        }
+    }
+
+    // PipeReader/PipeWriter variant — same behavior, driven through the BCL
+    // pipe adapters instead of the raw ReadAsync/TryGetItem/Write API.
+    public static async Task HandlePipeAsync(Reactor reactor, Connection conn)
+    {
+        var reader = new ConnectionPipeReader(conn);
+        var writer = new ConnectionPipeWriter(conn);
+
+        try
+        {
+            while (true)
+            {
+                ReadResult read = await reader.ReadAsync();
+                ReadOnlySequence buffer = read.Buffer;
+
+                if (!buffer.IsEmpty)
+                {
+                    // A real server would parse requests out of `buffer` here.
+                    writer.Write(Program.Response);
+                    await writer.FlushAsync();
+                }
+
+                // Consume everything we got; AdvanceTo returns the recv buffers.
+                reader.AdvanceTo(buffer.End);
+
+                if (read.IsCompleted)
+                {
+                    break;
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine($"[r{reactor.Id}] pipe handler crash on fd={conn.ClientFd}: {ex}");
+        }
+        finally
+        {
+            // Completing the reader returns any still-held recv buffers before the ref is dropped.
+            reader.Complete();
+            writer.Complete();
+            conn.DecRef();
+        }
+    }
+}
+
+internal sealed record Item(int Id, string Name, double Value, bool Active, string Category);
+internal sealed record Payload(string Generated, int Count, Item[] Items);
diff --git a/MinimaTPool/Reactor/Reactor.Incremental.cs b/MinimaTPool/Reactor/Reactor.Incremental.cs
new file mode 100644
index 0000000..b0dfbe2
--- /dev/null
+++ b/MinimaTPool/Reactor/Reactor.Incremental.cs
@@ -0,0 +1,306 @@
+using System.Runtime.InteropServices;
+using MinimaTPool.Utils;
+using static MinimaTPool.Native;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+/// <summary>
+/// Incremental-buffer (IOU_PBUF_RING_INC) path. Each connection gets its own
+/// buffer ring: one buffer accumulates that connection's byte stream across many
+/// recvs, so buffers are recycled only when the kernel is done appending AND the
+/// handler has returned every slice it was handed. Selected per reactor by the
+/// `_incremental` flag; the shared-ring path in Reactor.cs is untouched.
+/// </summary>
+public sealed unsafe partial class Reactor
+{
+    private Stack? _freeGids;   // buffer group ids not currently assigned to a connection
+    private Mpsc? _returnQInc;  // packed (fd, gen, bid) returns posted by off-reactor handlers
+
+    private void InitIncremental()
+    {
+        // Per-connection rings; no shared ring. GID 1 reserved; per-conn GIDs 2..MaxConnections+1.
+ _freeGids = new Stack(MaxConnections); + for (int g = MaxConnections + 1; g >= 2; g--) + _freeGids.Push((ushort)g); + + _returnQInc = new Mpsc(1 << 16); + } + + private ushort AllocGid() => _freeGids!.Pop(); + private void FreeGid(ushort gid) => _freeGids!.Push(gid); + + // ========================================================================= + // Per-connection ring lifecycle + // ========================================================================= + + private void SetupConnectionBufRing(Connection conn) + { + ushort gid = AllocGid(); + int entries = ConnBufRingEntries; + + // Ring control area + slab + tracking arrays are allocated once and + // reused across pool lives; only the kernel registration is per-life. + if (conn.BufRing == null) + conn.BufRing = (byte*)NativeMemory.AlignedAlloc((nuint)entries * 16, 4096); + NativeMemory.Clear(conn.BufRing, (nuint)entries * 16); + + if (conn.BufSlab == null) + conn.BufSlab = (byte*)NativeMemory.AlignedAlloc((nuint)entries * (nuint)IncRecvBufferSize, 64); + + conn.CumOffset ??= new int[entries]; + conn.RefCount ??= new int[entries]; + conn.KernelDone ??= new bool[entries]; + Array.Clear(conn.CumOffset, 0, entries); + Array.Clear(conn.RefCount, 0, entries); + Array.Clear(conn.KernelDone, 0, entries); + + var reg = new io_uring_buf_reg + { + ring_addr = (ulong)conn.BufRing, + ring_entries = (uint)entries, + bgid = gid, + flags = IOU_PBUF_RING_INC, + }; + int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, ®, 1); + if (ret < 0) + throw new InvalidOperationException($"register pbuf_ring (inc) failed: ret={ret} gid={gid}"); + + conn.Bgid = gid; + conn.BufRingEntries = entries; + conn.BufRingMask = (uint)(entries - 1); + conn.IncrementalMode = true; + + for (ushort bid = 0; bid < entries; bid++) + { + byte* slot = conn.BufRing + (uint)bid * 16; + *(ulong*)(slot + 0) = (ulong)(conn.BufSlab + bid * (nuint)IncRecvBufferSize); + *(uint*)(slot + 8) = IncRecvBufferSize; + *(ushort*)(slot + 12) = bid; + } + 
Volatile.Write(ref *(ushort*)(conn.BufRing + 14), (ushort)entries); + } + + private void TeardownConnectionBufRing(Connection conn) + { + if (conn.IncrementalMode) + { + var reg = new io_uring_buf_reg { bgid = conn.Bgid }; + io_uring_register(Ring.Fd, IORING_UNREGISTER_PBUF_RING, ®, 1); + FreeGid(conn.Bgid); + } + // BufRing / BufSlab / arrays stay allocated for pool reuse. + } + + // Re-add a fully-consumed buffer to its connection's ring (reactor-thread only). + private void ReturnConnectionBuffer(Connection conn, ushort bid) + { + conn.CumOffset![bid] = 0; + conn.RefCount![bid] = 0; + conn.KernelDone![bid] = false; + + ushort tail = Volatile.Read(ref *(ushort*)(conn.BufRing + 14)); + byte* slot = conn.BufRing + (tail & conn.BufRingMask) * 16; + *(ulong*)(slot + 0) = (ulong)(conn.BufSlab + bid * (nuint)IncRecvBufferSize); + *(uint*)(slot + 8) = IncRecvBufferSize; + *(ushort*)(slot + 12) = bid; + Volatile.Write(ref *(ushort*)(conn.BufRing + 14), (ushort)(tail + 1)); + } + + // ========================================================================= + // Refcounted return path (handler → reactor), carrying (fd, gen, bid) + // ========================================================================= + + // (fd, gen, bid) packed into one ulong for the incremental return queue: + // fd in the high 32 bits, gen in the next 16, bid in the low 16. + private static ulong PackReturn(int fd, ushort gen, ushort bid) + => ((ulong)(uint)fd << 32) | ((ulong)gen << 16) | bid; + + private static void UnpackReturn(ulong packed, out int fd, out ushort gen, out ushort bid) + { + fd = (int)(packed >> 32); + gen = (ushort)((packed >> 16) & 0xFFFF); + bid = (ushort)(packed & 0xFFFF); + } + + public void EnqueueReturnQIncremental(int fd, ushort gen, ushort bid) + { + // Fast path: caller is the reactor thread (handler resumed inline). 
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            ApplyReturnIncremental(fd, gen, bid);
+            return;
+        }
+        // Off-reactor: pack the triple, spin until the bounded queue accepts it,
+        // then wake the reactor through the eventfd.
+        ulong packed = PackReturn(fd, gen, bid);
+        SpinWait sw = default;
+        while (!_returnQInc!.TryEnqueue(packed))
+            sw.SpinOnce();
+        WakeFdWrite();
+    }
+
+    // Applies every queued off-reactor buffer return.
+    private void DrainReturnQIncremental()
+    {
+        while (_returnQInc!.TryDequeue(out ulong packed))
+        {
+            UnpackReturn(packed, out int fd, out ushort gen, out ushort bid);
+            ApplyReturnIncremental(fd, gen, bid);
+        }
+    }
+
+    // Drops one handler reference on buffer `bid`; the buffer goes back to the
+    // ring once no references remain AND the kernel has stopped appending to it.
+    // Returns for unknown fds or a different connection generation are ignored.
+    private void ApplyReturnIncremental(int fd, ushort gen, ushort bid)
+    {
+        if (!Connections.TryGetValue(fd, out var conn) || !conn.IncrementalMode)
+        {
+            return; // fd gone / ring already torn down
+        }
+        if ((ushort)conn.Generation != gen)
+        {
+            return; // stale return from a previous life (fd reused)
+        }
+
+        conn.RefCount![bid]--;
+        if (conn.RefCount[bid] <= 0 && conn.KernelDone![bid])
+        {
+            ReturnConnectionBuffer(conn, bid);
+        }
+    }
+
+    // =========================================================================
+    // Incremental reactor loop
+    // =========================================================================
+
+    // Reactor main loop for incremental mode: drain the cross-thread queues,
+    // submit and wait for at least one completion, dispatch every ready CQE.
+    // Exits on a non-transient SubmitAndWait error.
+    private void LoopIncremental()
+    {
+        while (true)
+        {
+            DrainReturnQIncremental();
+            DrainFlushQ();
+            DrainRecycleQ();
+
+            // NOTE(review): the comparisons below assume Ring.SubmitAndWait returns -errno — confirm in Ring.
+            int rc = Ring.SubmitAndWait(1);
+            if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY)
+            {
+                Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}");
+
+                break;
+            }
+
+            uint ready = Ring.CqReady();
+            for (uint i = 0; i < ready; i++)
+            {
+                DispatchIncremental(in Ring.CqeAt(i));
+            }
+            Ring.CqAdvance(ready);
+        }
+    }
+
+    // Handles one completion. user_data carries the kind tag in the high 32 bits
+    // and the fd in the low 32; IORING_CQE_F_MORE clear means a multishot
+    // request has ended and is re-armed here.
+    private void DispatchIncremental(in IoUringCqe cqe)
+    {
+        ulong kind = cqe.user_data & 0xffffffff_00000000UL;
+        int fd = (int)(cqe.user_data & 0xffffffffUL);
+        bool more = (cqe.flags & IORING_CQE_F_MORE) != 0;
+
+        if (kind == KindWake)
+        {
+            // Read the eventfd; the queues themselves are drained at the top of the loop.
+            ulong drain;
+            read(_wakeFd, &drain, 8);
+            if (!more)
+            {
+                ArmWakePoll();
+            }
+            return;
+        }
+
+        if (kind == KindAccept)
+        {
+            if (cqe.res >= 0)
+            {
+                int clientFd = cqe.res;
+                SetNoDelay(clientFd);
+                Connection conn = _pool.TryPop(out var pooled)
+                    ? pooled.SetFd(clientFd)
+                    : new Connection(this, clientFd, _config.WriteSlabSize);
+                Connections[clientFd] = conn;
+                conn.InitRefs();
+                // The per-connection ring must be registered before the recv that selects from it.
+                SetupConnectionBufRing(conn);
+                SubmitRecvMultishot(clientFd, conn.Bgid);
+
+                // Fire-and-forget: the returned task is discarded.
+                _ = _config.UsePipe
+                    ? Handler.HandlePipeAsync(this, conn)
+                    : Handler.HandleAsync(this, conn);
+            }
+            else
+            {
+                Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}");
+            }
+            if (!more)
+            {
+                SubmitAcceptMultishot();
+            }
+        }
+        else if (kind == KindRecv)
+        {
+            bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0;
+            bool bufMore = (cqe.flags & IORING_CQE_F_BUF_MORE) != 0;
+            ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0;
+
+            // NOTE(review): every non-positive result closes the connection, including
+            // transient errors such as buffer exhaustion — confirm this is intended.
+            if (cqe.res <= 0)
+            {
+                // Peer EOF / recv error — the whole per-conn ring is freed in Recycle.
+                if (Connections.Remove(fd, out var dyingConn))
+                {
+                    dyingConn.MarkClosed();
+                    dyingConn.DecRef();
+                }
+
+                return;
+            }
+
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                return; // straggler for a connection whose ring is already gone
+            }
+
+            // Data lands at the buffer's running offset; the kernel keeps
+            // appending to this bid until the buffer is full (F_BUF_MORE clear).
+            // NOTE(review): bid is 0 when F_BUFFER is absent, yet the slice is still
+            // accounted against buffer 0 — presumably a positive res always carries a buffer.
+            byte* ptr = conn.BufSlab + (nuint)bid * (nuint)IncRecvBufferSize + (nuint)conn.CumOffset![bid];
+            conn.CumOffset[bid] += cqe.res;
+            conn.RefCount![bid]++; // one reference per slice handed to the handler
+            if (!bufMore || !more)
+            {
+                conn.KernelDone![bid] = true;
+            }
+
+            conn.Complete(cqe.res, bid, hasBuffer: true, ptr);
+
+            if (!more)
+            {
+                SubmitRecvMultishot(fd, conn.Bgid);
+            }
+        }
+        else if (kind == KindSend)
+        {
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                return;
+            }
+            if (cqe.res <= 0)
+            {
+                // Send failed: drop the map entry and release the reactor's reference.
+                Connections.Remove(fd);
+                conn.MarkClosed();
+                conn.DecRef();
+
+                return;
+            }
+            conn.WriteHead += cqe.res;
+            if (conn.WriteHead < conn.WriteInFlight)
+            {
+                // Partial send: resubmit the unsent remainder.
+                SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead));
+
+                return;
+            }
+
+            conn.CompleteFlush();
+        }
+    }
+}
diff --git a/MinimaTPool/Reactor/Reactor.cs b/MinimaTPool/Reactor/Reactor.cs
new file mode 100644
index 0000000..a392198
--- /dev/null
+++ b/MinimaTPool/Reactor/Reactor.cs
@@ -0,0 +1,564 @@
+using System.Collections.Concurrent;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using MinimaTPool.Utils;
+using static MinimaTPool.Native;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool;
+
+///
+/// One reactor = one thread + one io_uring + one listening socket (SO_REUSEPORT)
+/// + one connection map. The reactor thread is the sole writer of the SQ ring,
+/// the kernel-shared buf_ring, and the connection map. Handlers may run on any
+/// thread (e.g. resumed by a thread-pool timer after `await Task.Delay(1)`);
+/// they reach the reactor only through two MPSC queues (`_returnQ`, `_flushQ`)
+/// woken by an `eventfd` registered as a multishot poll in the ring.
+///
+public sealed unsafe partial class Reactor
+{
+    public readonly int Id;
+    public Ring Ring = null!; // created on the reactor's own thread (DEFER_TASKRUN requires same-thread setup+enter)
+    public readonly Dictionary<int, Connection> Connections = new(); // keyed by client fd
+
+    private int _listenFd;
+    private readonly ServerConfig _config;
+    private readonly ushort _port;
+    private readonly uint _ringEntries;
+    private readonly bool _incremental;
+    private readonly uint RecvBufferSize;
+
+    // CQE user_data layout: kind tag in the high 32 bits, fd in the low 32.
+    private const ulong KindAccept = 1UL << 32;
+    private const ulong KindRecv = 2UL << 32;
+    private const ulong KindSend = 3UL << 32;
+    private const ulong KindWake = 4UL << 32; // eventfd-based cross-thread wake
+
+    // Provided-buffer ring (one per reactor, shared by all its connections).
+    private const ushort BgId = 1;
+    private readonly uint BufferRingEntries; // power of two
+    private byte* _bufRing; // io_uring_buf_ring (kernel-shared)
+    private byte* _bufSlab; // contiguous slab of recv buffers
+    private uint _bufRingMask;
+    private ushort _bufRingTail;
+
+    // Cross-thread wake mechanism: handlers running off-reactor enqueue work
+    // into these MPSC queues and `eventfd_write` _wakeFd; a multishot poll on
+    // _wakeFd registered with the ring delivers a CQE that wakes the reactor.
+    // When the caller is already the reactor thread (the common case — handler
+    // resumed inline from an IVTS SetResult), the Enqueue* methods bypass
+    // the queue and call the direct op, avoiding 2 syscalls per request.
+    private int _wakeFd;
+    private int _reactorThreadId;
+    private readonly Mpsc<ushort> _returnQ = new(1 << 14); // 16384 slots
+    private readonly Mpsc<int> _flushQ = new(1 << 12); // 4096 slots
+
+    // Teardown handoff: when a connection's refcount hits 0 off-reactor (handler exited
+    // on the thread pool), the recycle must run on the reactor (it touches the buf_ring
+    // and the reactor-only pool). Connection is a ref type, so this is a ConcurrentQueue
+    // rather than the unmanaged Mpsc.
+    private readonly ConcurrentQueue<Connection> _recycleQ = new();
+
+    // Connection pool. Reactor-thread-only — accept and teardown both run on
+    // this reactor, so a plain Stack is sufficient (no MPMC primitive
+    // needed). PoolMax caps the slab footprint per reactor:
+    // PoolMax × WriteSlabSize × ReactorCount = total reserved native memory.
+    private readonly int PoolMax;
+    private readonly Stack<Connection> _pool;
+
+    // Incremental-mode (IOU_PBUF_RING_INC) sizing. Each connection gets its own
+    // ring, so reserved native memory is bounded by:
+    // PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount.
+    // Keep entries small — the point of incremental is that one buffer holds
+    // many reads, so you need few of them per connection.
+    private readonly int MaxConnections; // GID cap (one bgid per active connection)
+    private readonly int ConnBufRingEntries; // buffers per connection ring
+    private readonly uint IncRecvBufferSize; // bytes per buffer (filled incrementally)
+
+    // Transient io_uring_enter errnos (Linux): interrupted, would-block, busy.
+    private const int EINTR = 4;
+    private const int EAGAIN = 11;
+    private const int EBUSY = 16;
+
+    // Copies the tunables out of `config`. No ring, socket or native buffer is
+    // created here; that happens in Run() on the reactor's own thread.
+    public Reactor(int id, ServerConfig config)
+    {
+        Id = id;
+        _config = config;
+        _port = config.Port;
+        _ringEntries = config.RingEntries;
+        _incremental = config.Incremental;
+        RecvBufferSize = (uint)config.RecvBufferSize;
+        BufferRingEntries = (uint)config.BufferRingEntries;
+        PoolMax = config.PoolMax;
+        MaxConnections = config.MaxConnections;
+        ConnBufRingEntries = config.ConnBufRingEntries;
+        IncRecvBufferSize = (uint)config.IncRecvBufferSize;
+        _pool = new Stack<Connection>(config.PoolMax);
+    }
+
+    // =========================================================================
+    // Buffer ring
+    // =========================================================================
+
+    // Allocates the shared ring + slab, registers the ring under BgId and
+    // publishes every buffer. Throws InvalidOperationException (with errno)
+    // when the kernel rejects the registration.
+    private void InitBufferRing()
+    {
+        nuint ringBytes = (nuint)BufferRingEntries * 16;
+        _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096);
+        NativeMemory.Clear(_bufRing, ringBytes);
+
+        nuint slabBytes = BufferRingEntries * (nuint)RecvBufferSize;
+        _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64);
+
+        _bufRingMask = BufferRingEntries - 1;
+
+        var reg = new io_uring_buf_reg {
+            ring_addr = (ulong)_bufRing,
+            ring_entries = BufferRingEntries,
+            bgid = BgId,
+        };
+
+        int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, &reg, 1);
+        if (ret < 0)
+        {
+            int err = Marshal.GetLastPInvokeError();
+
+            throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}");
+        }
+
+        // Populate every slot once. Slot 0 overlaps with the ring's tail field
+        // at offset 14, but we only write addr/len/bid (offsets 0..13) so tail
+        // stays at zero until we set it explicitly.
+        for (ushort bid = 0; bid < BufferRingEntries; bid++) {
+            byte* slot = _bufRing + (uint)bid * 16;
+            *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+            *(uint*)(slot + 8) = RecvBufferSize;
+            *(ushort*)(slot + 12) = bid;
+        }
+        _bufRingTail = (ushort)BufferRingEntries;
+
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    // Reactor-thread-only: writes the kernel-shared buf_ring tail directly.
+    // Off-reactor callers must use EnqueueReturnQ instead.
+    internal void ReturnBufferDirect(ushort bid)
+    {
+        byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16;
+        *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+        *(uint*)(slot + 8) = RecvBufferSize;
+        *(ushort*)(slot + 12) = bid;
+        _bufRingTail++;
+
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    // =========================================================================
+    // Cross-thread entry points (safe to call from any thread)
+    // =========================================================================
+
+    // Returns recv buffer `bid` to the shared ring: directly when called on the
+    // reactor thread, otherwise through _returnQ (spinning while it is full).
+    public void EnqueueReturnQ(ushort bid)
+    {
+        // Fast path: caller is the reactor thread (handler running inline from
+        // an IVTS SetResult). Go straight to the buf_ring — no queue, no syscall.
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            ReturnBufferDirect(bid);
+            return;
+        }
+        SpinWait sw = default;
+        while (!_returnQ.TryEnqueue(bid))
+        {
+            sw.SpinOnce();
+        }
+        // NOTE(review): unlike EnqueueFlush / EnqueueRecycle, no wake is issued here,
+        // so a queued return is only drained the next time the reactor wakes for
+        // another reason. Confirm this is a deliberate syscall saving.
+        //WakeFdWrite();
+    }
+
+    internal void EnqueueFlush(int fd)
+    {
+        // Fast path: caller is the reactor thread; write the SQE directly.
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            if (Connections.TryGetValue(fd, out var conn))
+            {
+                SubmitSend(fd, conn.WriteBuffer, (uint)conn.WriteInFlight);
+            }
+            return;
+        }
+        // Off-reactor: queue the fd (spinning while the bounded queue is full) and wake the reactor.
+        SpinWait sw = default;
+        while (!_flushQ.TryEnqueue(fd))
+        {
+            sw.SpinOnce();
+        }
+        WakeFdWrite();
+    }
+
+    // Signals the reactor from another thread by bumping the eventfd counter.
+    private void WakeFdWrite()
+    {
+        ulong v = 1;
+        // 8-byte write to eventfd increments its counter; the kernel marks the
+        // fd readable, which fires our registered multishot poll's next CQE.
+        write(_wakeFd, &v, 8);
+    }
+
+    // Puts every queued off-reactor buffer return back on the shared ring.
+    private void DrainReturnQ()
+    {
+        while (_returnQ.TryDequeue(out ushort bid))
+        {
+            ReturnBufferDirect(bid);
+        }
+    }
+
+    // Turns every queued flush request into a send SQE; fds no longer in the
+    // connection map are skipped.
+    private void DrainFlushQ()
+    {
+        while (_flushQ.TryDequeue(out int fd))
+        {
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                continue;
+            }
+            // Connection state was set by FlushAsync; the Enqueue/Dequeue pair
+            // establishes the happens-before so WriteInFlight is visible here.
+            SubmitSend(fd, conn.WriteBuffer, (uint)conn.WriteInFlight);
+        }
+    }
+
+    // Called by Connection.DecRef when the refcount hits 0. Teardown must run on the
+    // reactor (buf_ring + pool are reactor-owned), so off-reactor callers hand off.
+    internal void EnqueueRecycle(Connection conn)
+    {
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            Recycle(conn, conn.ClientFd);
+            return;
+        }
+        _recycleQ.Enqueue(conn);
+        WakeFdWrite();
+    }
+
+    // Recycles every connection handed off by EnqueueRecycle.
+    private void DrainRecycleQ()
+    {
+        while (_recycleQ.TryDequeue(out Connection? conn))
+        {
+            Recycle(conn, conn.ClientFd);
+        }
+    }
+
+    // Queues a multishot POLLIN poll on the eventfd; its CQEs are tagged KindWake.
+    private void ArmWakePoll()
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_POLL_ADD;
+        sqe->fd = _wakeFd;
+        sqe->op_flags = POLLIN; // poll32_events lives at this offset
+        sqe->len = IORING_POLL_ADD_MULTI; // multishot — stays armed across CQEs
+        sqe->user_data = KindWake | (uint)_wakeFd;
+    }
+
+    // =========================================================================
+    // Main loop
+    // =========================================================================
+
+    // Reactor thread entry point: creates the ring, listener, buffer ring(s) and
+    // eventfd on the calling thread, arms accept + wake, then runs the loop for
+    // the configured mode. When the loop exits, the listener, eventfd and ring
+    // are released. Throws InvalidOperationException if eventfd creation fails.
+    public void Run()
+    {
+        // Recorded first: the Enqueue* fast paths compare against this id.
+        _reactorThreadId = Environment.CurrentManagedThreadId;
+
+        Ring = Ring.Create(_ringEntries);
+        _listenFd = OpenReusePortListener(_port);
+
+        if (_incremental)
+        {
+            InitIncremental();
+        }
+        else
+        {
+            InitBufferRing();
+        }
+
+        _wakeFd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+        if (_wakeFd < 0)
+        {
+            throw new InvalidOperationException("eventfd failed");
+        }
+
+        Console.WriteLine($"[r{Id}] listening on 0.0.0.0:{_port} (incremental={_incremental})");
+        SubmitAcceptMultishot();
+        ArmWakePoll();
+
+        if (_incremental)
+        {
+            LoopIncremental();
+        }
+        else
+        {
+            LoopShared();
+        }
+
+        close(_listenFd);
+        close(_wakeFd);
+        Ring.Dispose();
+    }
+
+    // Reactor main loop for the shared-ring mode.
+    private void LoopShared()
+    {
+        while (true)
+        {
+            // Drain MPSC queues from off-reactor handlers. Cheap when empty.
+            DrainReturnQ();
+            DrainFlushQ();
+            DrainRecycleQ();
+
+            // NOTE(review): the comparisons below assume Ring.SubmitAndWait returns -errno — confirm in Ring.
+            int rc = Ring.SubmitAndWait(1);
+            if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY)
+            {
+                Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}");
+                break;
+            }
+
+            uint ready = Ring.CqReady();
+            for (uint i = 0; i < ready; i++)
+            {
+                Dispatch(in Ring.CqeAt(i));
+            }
+            Ring.CqAdvance(ready);
+        }
+    }
+
+    // Handles one completion in shared-ring mode. user_data carries the kind tag
+    // in the high 32 bits and the fd in the low 32; IORING_CQE_F_MORE clear means
+    // a multishot request has ended and is re-armed here.
+    private void Dispatch(in IoUringCqe cqe)
+    {
+        ulong kind = cqe.user_data & 0xffffffff_00000000UL;
+        int fd = (int)(cqe.user_data & 0xffffffffUL);
+        bool more = (cqe.flags & IORING_CQE_F_MORE) != 0;
+
+        if (kind == KindWake)
+        {
+            // Drain the eventfd counter so the next write re-triggers POLLIN
+            // (multishot poll is edge-triggered on the user_space side).
+            ulong drain;
+            read(_wakeFd, &drain, 8);
+            // The actual queue drains happen at the top of the next loop
+            // iteration — nothing else to do here.
+            if (!more)
+            {
+                ArmWakePoll();
+            }
+            return;
+        }
+
+        if (kind == KindAccept)
+        {
+            if (cqe.res >= 0)
+            {
+                int clientFd = cqe.res;
+                SetNoDelay(clientFd);
+                // Reuse a pooled Connection when one is available.
+                Connection conn = _pool.TryPop(out var pooled)
+                    ? pooled.SetFd(clientFd)
+                    : new Connection(this, clientFd, _config.WriteSlabSize);
+                Connections[clientFd] = conn;
+                conn.InitRefs();
+                SubmitRecvMultishot(clientFd);
+
+                // Fire-and-forget: the returned task is discarded.
+                _ = _config.UsePipe
+                    ? Handler.HandlePipeAsync(this, conn)
+                    : Handler.HandleAsync(this, conn);
+            }
+            else
+            {
+                Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}");
+            }
+            // Multishot accept stays armed; only re-arm if the kernel terminated it.
+            if (!more)
+            {
+                SubmitAcceptMultishot();
+            }
+        }
+        else if (kind == KindRecv)
+        {
+            bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0;
+            ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0;
+
+            // NOTE(review): every non-positive result closes the connection, including
+            // transient errors such as buffer exhaustion — confirm this is intended.
+            if (cqe.res <= 0)
+            {
+                // Peer EOF or recv error — reactor owns teardown.
+                if (hasBuf)
+                {
+                    ReturnBufferDirect(bid);
+                }
+                if (Connections.Remove(fd, out var dyingConn))
+                {
+                    dyingConn.MarkClosed(); // signal the handler to exit
+                    dyingConn.DecRef(); // release the reactor's ref; teardown at refs==0
+                }
+                return;
+            }
+
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                // Straggler buffer for an already-closed connection.
+                if (hasBuf)
+                {
+                    ReturnBufferDirect(bid);
+                }
+                return;
+            }
+
+            // Each bid maps to a fixed RecvBufferSize-sized region of the slab.
+            byte* ptr = hasBuf ? _bufSlab + (nuint)bid * (nuint)RecvBufferSize : null;
+            conn.Complete(cqe.res, bid, hasBuf, ptr);
+
+            if (!more)
+            {
+                SubmitRecvMultishot(fd);
+            }
+        }
+        else if (kind == KindSend)
+        {
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                return;
+            }
+            if (cqe.res <= 0)
+            {
+                // Send error — release the reactor's ref; teardown when the handler exits too.
+                Connections.Remove(fd);
+                conn.MarkClosed();
+                conn.DecRef();
+                return;
+            }
+            conn.WriteHead += cqe.res;
+            if (conn.WriteHead < conn.WriteInFlight)
+            {
+                // Partial send: resubmit the remainder.
+                SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead));
+                return;
+            }
+            // Full target ack'd — resets buffer state and signals the awaiter.
+            conn.CompleteFlush();
+        }
+    }
+
+    // =========================================================================
+    // SQE producers (reactor-thread-only — Connection.FlushAsync hands off via
+    // EnqueueFlush, which DrainFlushQ turns into SubmitSend on this thread)
+    // =========================================================================
+
+    // Returns a free SQE. If the SQ is full, submits what is pending (without
+    // waiting for completions) and retries once; throws InvalidOperationException
+    // if there is still no free slot.
+    private IoUringSqe* GetSqeOrFlush()
+    {
+        IoUringSqe* sqe = Ring.GetSqe();
+        if (sqe != null)
+        {
+            return sqe;
+        }
+
+        Ring.SubmitAndWait(0);
+        sqe = Ring.GetSqe();
+
+        if (sqe == null)
+        {
+            throw new InvalidOperationException("SQ full after flush");
+        }
+
+        return sqe;
+    }
+
+    // Queues a multishot accept on the listening socket.
+    private void SubmitAcceptMultishot()
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_ACCEPT;
+        sqe->ioprio = IORING_ACCEPT_MULTISHOT;
+        sqe->fd = _listenFd;
+        sqe->user_data = KindAccept | (uint)_listenFd;
+    }
+
+    // Shared-ring convenience overload: recv into the reactor-wide buffer group.
+    private void SubmitRecvMultishot(int fd) => SubmitRecvMultishot(fd, BgId);
+
+    // Queues a multishot recv on `fd` that picks its buffers from group `bgid`.
+    private void SubmitRecvMultishot(int fd, ushort bgid)
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_RECV;
+        sqe->flags = IOSQE_BUFFER_SELECT;
+        sqe->ioprio = IORING_RECV_MULTISHOT;
+        sqe->fd = fd;
+        sqe->buf_index = bgid; // buffer-group id (shared BgId, or per-conn in incremental)
+        sqe->user_data = KindRecv | (uint)fd;
+    }
+
+    // Queues a send of `len` bytes starting at `buf` on `fd`.
+    private void SubmitSend(int fd, byte* buf, uint len)
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_SEND;
+        sqe->fd = fd;
+        sqe->addr = (ulong)buf;
+        sqe->len = len;
+        sqe->user_data = KindSend | (uint)fd;
+    }
+
+    // Final teardown of a connection (reactor thread only).
+    private void Recycle(Connection conn, int fd)
+    {
+        // Wake awaiters, drain in-flight buffers, close the fd, reset state,
+        // and either push the Connection back to the pool or free its native
+        // WriteBuffer if the pool is full.
+        conn.MarkClosed();
+        if (_incremental)
+        {
+            // The per-connection ring is freed wholesale; no per-buffer return.
+            // Clear() empties the SPSC ring (leftover slices discarded).
+            TeardownConnectionBufRing(conn);
+        }
+        else
+        {
+            conn.DrainRecv(); // return leftover buffers to the shared ring
+        }
+        close(fd);
+        conn.Clear();
+
+        if (_pool.Count < PoolMax)
+        {
+            _pool.Push(conn);
+        }
+        else
+        {
+            conn.Dispose();
+        }
+    }
+
+    // Disable Nagle on an accepted connection. Must be set per-accepted-socket,
+    // not on the listener — TCP_NODELAY doesn't reliably inherit across accept,
+    // which is why zerg/terraform/rtr all set it on the client fd, not the listener.
+    private static void SetNoDelay(int fd)
+    {
+        int one = 1;
+        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int));
+    }
+
+    // Opens an IPv4 TCP listener on 0.0.0.0:port with SO_REUSEADDR and
+    // SO_REUSEPORT set, and returns its fd. Throws InvalidOperationException
+    // if socket, bind or listen fails; the socket is closed before throwing so
+    // a failed setup does not leak a descriptor.
+    private static int OpenReusePortListener(ushort port)
+    {
+        int fd = socket(AF_INET, SOCK_STREAM, 0);
+        if (fd < 0)
+        {
+            throw new InvalidOperationException($"socket failed: {fd}");
+        }
+
+        // Best effort: the setsockopt results are not checked.
+        int one = 1;
+        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
+        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int));
+
+        sockaddr_in addr = default;
+        addr.sin_family = AF_INET;
+        addr.sin_port = Htons(port);
+        addr.sin_addr.s_addr = 0; // 0.0.0.0
+
+        if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0)
+        {
+            close(fd);
+            throw new InvalidOperationException("bind failed");
+        }
+
+        if (listen(fd, 128) < 0)
+        {
+            close(fd);
+            throw new InvalidOperationException("listen failed");
+        }
+
+        return fd;
+    }
+}
diff --git a/MinimaTPool/ServerConfig.cs b/MinimaTPool/ServerConfig.cs
new file mode 100644
index 0000000..d6dd3db
--- /dev/null
+++ b/MinimaTPool/ServerConfig.cs
@@ -0,0 +1,35 @@
+namespace MinimaTPool;
+
+/// <summary>
+/// All server tunables in one place — replaces the consts that used to be
+/// scattered across Program.cs and Reactor.cs. Defaults match the previous
+/// hardcoded values; override via object initializer in Main, e.g.:
+///     new ServerConfig { Port = 9000, ReactorCount = 8, Incremental = true }.
+/// </summary>
+public sealed record ServerConfig
+{
+    // Server-level.
+    public ushort Port { get; init; } = 8080;
+    public int ReactorCount { get; init; } = 12;
+
+    // Handler style: false = raw ReadAsync/TryGetItem loop; true = PipeReader/PipeWriter.
+    public bool UsePipe { get; init; } = false;
+
+    // io_uring SQ/CQ depth.
+    public uint RingEntries { get; init; } = 8192;
+
+    // Shared buffer ring (used when Incremental == false).
+    public int RecvBufferSize { get; init; } = 32 * 1024;
+    public int BufferRingEntries { get; init; } = 4096;
+
+    // Per-connection write slab + connection pool cap.
+    public int WriteSlabSize { get; init; } = 16 * 1024;
+    public int PoolMax { get; init; } = 1024;
+
+    // Incremental mode (IOU_PBUF_RING_INC) — per-connection rings.
+    // reserved native memory ≈ PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount.
+    public bool Incremental { get; init; } = false;
+    public int MaxConnections { get; init; } = 4096; // GID cap (one bgid per active connection)
+    public int ConnBufRingEntries { get; init; } = 16; // buffers per connection ring
+    public int IncRecvBufferSize { get; init; } = 4096; // bytes per buffer (filled incrementally)
+}
diff --git a/MinimaTPool/Utils/Mpsc.cs b/MinimaTPool/Utils/Mpsc.cs
new file mode 100644
index 0000000..62dbeab
--- /dev/null
+++ b/MinimaTPool/Utils/Mpsc.cs
@@ -0,0 +1,115 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool.Utils;
+
+/// <summary>
+/// Bounded lock-free multi-producer / single-consumer queue.
+///
+/// Dmitry Vyukov's bounded MPMC algorithm, specialised to one consumer.
+/// Power-of-two capacity, zero-allocation after construction. Producers claim a
+/// slot via CAS on the enqueue position (a failed TryEnqueue on a full queue
+/// leaves the position untouched — no burned tickets); the single consumer
+/// advances the dequeue position with a plain write. Each slot carries a
+/// sequence number that coordinates ownership between producers and consumer.
+///
+/// One generic queue serves every reactor handoff: Mpsc&lt;ushort&gt; for buffer
+/// returns, Mpsc&lt;int&gt; for flush fds, Mpsc&lt;ulong&gt; for packed incremental
+/// returns. T is unmanaged so each Cell is a blittable value type with no GC refs.
+/// </summary>
+internal sealed class Mpsc<T> where T : unmanaged
+{
+    // One queue slot. Sequence says who owns it: equal to the position when it
+    // is free for that producer, position + 1 once a value has been published.
+    private struct Cell
+    {
+        public long Sequence;
+        public T Value;
+    }
+
+    private readonly Cell[] _buffer;
+    private readonly int _mask;
+
+    // PaddedLong is a top-level struct (not nested here) because the CLR forbids
+    // explicit layout on a type nested inside a generic.
+    private PaddedLong _enqueuePos;
+    private PaddedLong _dequeuePos;
+
+    /// <summary>Creates a queue with <paramref name="capacityPow2"/> slots.</summary>
+    /// <exception cref="ArgumentException">The capacity is not a power of two &gt;= 2.</exception>
+    public Mpsc(int capacityPow2)
+    {
+        if (capacityPow2 < 2 || (capacityPow2 & (capacityPow2 - 1)) != 0)
+            throw new ArgumentException("Capacity must be a power of two >= 2.", nameof(capacityPow2));
+
+        _buffer = new Cell[capacityPow2];
+        _mask = capacityPow2 - 1;
+
+        // Slot i starts out free for the producer that claims position i.
+        for (int i = 0; i < capacityPow2; i++)
+            _buffer[i].Sequence = i;
+    }
+
+    /// <summary>Multi-producer safe. Returns false if the queue is full.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryEnqueue(T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        while (true)
+        {
+            long pos = Volatile.Read(ref _enqueuePos.Value);
+            ref Cell cell = ref buffer[(int)pos & mask];
+
+            long seq = Volatile.Read(ref cell.Sequence);
+            long dif = seq - pos;
+
+            if (dif == 0)
+            {
+                // Slot is free for this position: claim it, then publish the value.
+                if (Interlocked.CompareExchange(ref _enqueuePos.Value, pos + 1, pos) == pos)
+                {
+                    cell.Value = item;
+                    Volatile.Write(ref cell.Sequence, pos + 1);
+                    return true;
+                }
+                continue; // lost the race; reload and retry
+            }
+
+            if (dif < 0)
+                return false; // slot not yet consumed → full
+
+            // dif > 0: another producer already took this position; loop and reload.
+        }
+    }
+
+    /// <summary>Single-consumer only. Returns false if empty.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeue(out T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        long pos = _dequeuePos.Value; // single consumer: plain read
+        ref Cell cell = ref buffer[(int)pos & mask];
+
+        // A producer publishes a slot by storing pos + 1 in its sequence, so a
+        // difference of zero means the value for this position is ready.
+        long seq = Volatile.Read(ref cell.Sequence);
+        long dif = seq - (pos + 1);
+
+        if (dif == 0)
+        {
+            item = cell.Value;
+            _dequeuePos.Value = pos + 1; // single consumer: plain write
+            Volatile.Write(ref cell.Sequence, pos + mask + 1); // free slot for producers
+            return true;
+        }
+
+        // Nothing published at this position yet: report empty.
+        item = default;
+        return false;
+    }
+}
+
+/// <summary>
+/// A single long padded to a 64-byte cache line so the producer and consumer
+/// positions never share a line (no false sharing). Top-level and non-generic
+/// so it can legally use explicit layout.
+/// </summary>
+[StructLayout(LayoutKind.Explicit, Size = 64)]
+internal struct PaddedLong
+{
+    [FieldOffset(0)] public long Value;
+}
diff --git a/MinimaTPool/Utils/RingSegment.cs b/MinimaTPool/Utils/RingSegment.cs
new file mode 100644
index 0000000..a797b3c
--- /dev/null
+++ b/MinimaTPool/Utils/RingSegment.cs
@@ -0,0 +1,31 @@
+using System.Buffers;
+
+namespace MinimaTPool.Utils;
+
+///
+/// One segment of a multi-buffer ReadOnlySequence<byte> built by the
+/// ConnectionPipeReader when a single read spans more than one recv buffer.
+/// BufferId is carried for debugging; buffer return is driven off the held
+/// item list, not the segments.
+///
+public sealed class RingSegment : ReadOnlySequenceSegment<byte>
+{
+    // Id of the recv buffer backing this segment.
+    public ushort BufferId { get; }
+
+    // Creates a segment over `memory`; RunningIndex keeps its default of 0.
+    public RingSegment(ReadOnlyMemory<byte> memory, ushort bufferId)
+    {
+        Memory = memory;
+        BufferId = bufferId;
+    }
+
+    // Links a new segment after this one and returns it. The new segment's
+    // RunningIndex is this segment's RunningIndex plus this segment's length.
+    public RingSegment Append(ReadOnlyMemory<byte> memory, ushort bufferId)
+    {
+        var next = new RingSegment(memory, bufferId)
+        {
+            RunningIndex = RunningIndex + Memory.Length
+        };
+
+        Next = next;
+        return next;
+    }
+}
diff --git a/MinimaTPool/Utils/SpscRecvRing.cs b/MinimaTPool/Utils/SpscRecvRing.cs
new file mode 100644
index 0000000..288df28
--- /dev/null
+++ b/MinimaTPool/Utils/SpscRecvRing.cs
@@ -0,0 +1,105 @@
+using System.Runtime.CompilerServices;
+
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace MinimaTPool.Utils;
+
+// Bounded ring of received slices with one writer position (_tail) and one
+// reader position (_head). TryEnqueue only advances _tail and the TryDequeue*
+// methods only advance _head, each published with a volatile write.
+public sealed unsafe class SpscRecvRing
+{
+    // One received slice: Len bytes at Ptr, belonging to recv buffer Bid.
+    public struct Item
+    {
+        public byte* Ptr;
+        public ushort Bid;
+        public int Len;
+        public bool HasBuffer;
+        public ushort Gen; // connection generation when enqueued (incremental return guard)
+
+        public ReadOnlySpan<byte> AsSpan() => new(Ptr, Len);
+
+        public UnmanagedMemoryManager AsMemoryManager() => new(Ptr, Len, Bid);
+    }
+
+    private readonly Item[] _items;
+    private readonly int _mask;
+    private long _tail;
+    private long _head;
+
+    // Throws ArgumentException unless capacityPow2 is a positive power of two.
+    public SpscRecvRing(int capacityPow2)
+    {
+        if (capacityPow2 <= 0 || (capacityPow2 & (capacityPow2 - 1)) != 0)
+        {
+            throw new ArgumentException("capacity must be a power of two", nameof(capacityPow2));
+        }
+
+        _items = new Item[capacityPow2];
+        _mask = capacityPow2 - 1;
+    }
+
+    // Producer side. Returns false when the ring is full.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryEnqueue(in Item item)
+    {
+        long head = Volatile.Read(ref _head);
+        long tail = _tail;
+
+        if ((ulong)(tail - head) >= (ulong)_items.Length)
+        {
+            return false;
+        }
+
+        // Store the item first, then publish it by advancing the tail.
+        _items[(int)(tail & _mask)] = item;
+        Volatile.Write(ref _tail, tail + 1);
+
+        return true;
+    }
+
+    // Consumer side. Returns false when the ring is empty.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeue(out Item item)
+    {
+        long head = _head;
+        long tail = Volatile.Read(ref _tail);
+
+        if (head >= tail)
+        {
+            item = default;
+            return false;
+        }
+
+        item = _items[(int)(head & _mask)];
+        Volatile.Write(ref _head, head + 1);
+
+        return true;
+    }
+
+    // Current tail position, for use with TryDequeueUntil.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public long SnapshotTail() => Volatile.Read(ref _tail);
+
+    // Like TryDequeue, but stops at a previously captured tail so items
+    // enqueued after the snapshot are left in the ring.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeueUntil(long tailSnapshot, out Item item)
+    {
+        long head = _head;
+
+        if (head >= tailSnapshot)
+        {
+            item = default;
+            return false;
+        }
+
+        item = _items[(int)(head & _mask)];
+        Volatile.Write(ref _head, head + 1);
+
+        return true;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool IsEmpty() => Volatile.Read(ref _head) >= Volatile.Read(ref _tail);
+
+    // Reactor-thread-only, called during connection teardown (Clear) when no
+    // handler is consuming. Discards any leftover items so the recycled
+    // connection starts empty.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void Reset()
+    {
+        _head = 0;
+        _tail = 0;
+    }
+}
diff --git a/MinimaTPool/Utils/UnmanagedMemoryManager.cs b/MinimaTPool/Utils/UnmanagedMemoryManager.cs
new file mode 100644
index 0000000..0c03a26
--- /dev/null
+++ b/MinimaTPool/Utils/UnmanagedMemoryManager.cs
@@ -0,0 +1,32 @@
+using System.Buffers;
+
+namespace MinimaTPool.Utils;
+
+// Exposes a raw (pointer, length) region as Memory<byte>. The manager does not
+// own the memory: Unpin and Dispose are no-ops.
+public sealed unsafe class UnmanagedMemoryManager : MemoryManager<byte>
+{
+    private readonly byte* _ptr;
+    private readonly int _length;
+
+    // Recv buffer id the region belongs to; 0 when built without one.
+    public ushort BufferId { get; }
+
+    public UnmanagedMemoryManager(byte* ptr, int length)
+    {
+        _ptr = ptr;
+        _length = length;
+    }
+
+    public UnmanagedMemoryManager(byte* ptr, int length, ushort bufferId)
+    {
+        _ptr = ptr;
+        _length = length;
+        BufferId = bufferId;
+    }
+
+    public override Span<byte> GetSpan() => new(_ptr, _length);
+
+    // Returns a handle pointing at element `elementIndex`. The memory is native,
+    // so nothing is actually pinned. Throws ArgumentOutOfRangeException when the
+    // index falls outside [0, length]; an index equal to the length is accepted
+    // so that an empty slice at the end of the region can still be pinned.
+    public override MemoryHandle Pin(int elementIndex = 0)
+    {
+        if ((uint)elementIndex > (uint)_length)
+        {
+            throw new ArgumentOutOfRangeException(nameof(elementIndex));
+        }
+
+        return new MemoryHandle(_ptr + elementIndex);
+    }
+
+    public override void Unpin() { }
+
+    protected override void Dispose(bool disposing) { }
+}
diff --git 
a/MinimaTPool/io_uring/Native.cs b/MinimaTPool/io_uring/Native.cs new file mode 100644 index 0000000..f6780e3 --- /dev/null +++ b/MinimaTPool/io_uring/Native.cs @@ -0,0 +1,162 @@ +using System.Runtime.InteropServices; + +namespace MinimaTPool; + +/// +/// All native interop in one file: io_uring syscalls, libc socket calls, +/// the kernel struct layouts they expect, and the constants needed to +/// drive a minimal io_uring loop. +/// +public static unsafe class Native { + private const long SYS_IO_URING_SETUP = 425; + private const long SYS_IO_URING_ENTER = 426; + private const long SYS_IO_URING_REGISTER = 427; + + public const byte IORING_OP_POLL_ADD = 6; + public const byte IORING_OP_ACCEPT = 13; + public const byte IORING_OP_SEND = 26; + public const byte IORING_OP_RECV = 27; + public const uint IORING_ENTER_GETEVENTS = 1u << 0; + public const long IORING_OFF_SQ_RING = 0; + public const long IORING_OFF_SQES = 0x10000000; + + // Multishot / buffer-ring goodies. + public const ushort IORING_ACCEPT_MULTISHOT = 1 << 0; + public const ushort IORING_RECV_MULTISHOT = 1 << 1; + public const byte IOSQE_BUFFER_SELECT = 1 << 5; + public const uint IORING_CQE_F_BUFFER = 1u << 0; + public const uint IORING_CQE_F_MORE = 1u << 1; + public const int IORING_CQE_BUFFER_SHIFT = 16; + public const uint IORING_REGISTER_PBUF_RING = 22; + public const uint IORING_UNREGISTER_PBUF_RING = 23; + public const uint IORING_POLL_ADD_MULTI = 1u << 0; + + // Incremental provided-buffer consumption (kernel 6.12+). IOU_PBUF_RING_INC + // is set in io_uring_buf_reg.flags at registration; IORING_CQE_F_BUF_MORE is + // set on recv CQEs while the kernel will keep appending to the same buffer. + public const ushort IOU_PBUF_RING_INC = 2; + public const uint IORING_CQE_F_BUF_MORE = 1u << 4; + + // eventfd flags + poll mask (used for the cross-thread wake mechanism). 
+ public const int EFD_CLOEXEC = 0x80000; + public const int EFD_NONBLOCK = 0x800; + public const uint POLLIN = 0x0001; + + // Setup flags. SINGLE_ISSUER tells the kernel only one thread will submit + // to this ring (skips locking on the SQ). DEFER_TASKRUN defers completion + // processing until io_uring_enter(GETEVENTS), which lets the kernel batch + // work and avoids interrupting the reactor with task_work mid-flight. + public const uint IORING_SETUP_SINGLE_ISSUER = 1u << 12; + public const uint IORING_SETUP_DEFER_TASKRUN = 1u << 13; + + public const int PROT_READ = 1; + public const int PROT_WRITE = 2; + public const int MAP_SHARED = 1; + public const int MAP_POPULATE = 0x8000; + + public const int AF_INET = 2; + public const int SOCK_STREAM = 1; + public const int SOL_SOCKET = 1; + public const int SO_REUSEADDR = 2; + public const int SO_REUSEPORT = 15; + public const int IPPROTO_TCP = 6; + public const int TCP_NODELAY = 1; + + [DllImport("libc", EntryPoint = "syscall")] + private static extern long syscall3(long nr, uint a1, IoUringParams* a2); + + [DllImport("libc", EntryPoint = "syscall")] + private static extern long syscall6(long nr, uint a1, uint a2, uint a3, uint a4, void* a5, nuint a6); + + [DllImport("libc", EntryPoint = "syscall", SetLastError = true)] + private static extern long syscall4(long nr, uint a1, uint a2, void* a3, uint a4); + + public static int io_uring_setup(uint entries, IoUringParams* p) => + (int)syscall3(SYS_IO_URING_SETUP, entries, p); + + public static int io_uring_enter(int fd, uint toSubmit, uint minComplete, uint flags) => + (int)syscall6(SYS_IO_URING_ENTER, (uint)fd, toSubmit, minComplete, flags, null, 0); + + public static int io_uring_register(int fd, uint opcode, void* arg, uint nrArgs) => + (int)syscall4(SYS_IO_URING_REGISTER, (uint)fd, opcode, arg, nrArgs); + + [DllImport("libc")] public static extern void* mmap(void* addr, nuint length, int prot, int flags, int fd, long offset); + [DllImport("libc")] public static 
extern int munmap(void* addr, nuint length); + [DllImport("libc")] public static extern int close(int fd); + [DllImport("libc")] public static extern int socket(int domain, int type, int proto); + [DllImport("libc")] public static extern int bind(int fd, sockaddr_in* addr, uint len); + [DllImport("libc")] public static extern int listen(int fd, int backlog); + [DllImport("libc")] public static extern int setsockopt(int fd, int level, int optname, void* optval, uint optlen); + [DllImport("libc")] public static extern int eventfd(uint initval, int flags); + [DllImport("libc")] public static extern long write(int fd, void* buf, nuint count); + [DllImport("libc")] public static extern long read(int fd, void* buf, nuint count); + + public static ushort Htons(ushort x) => (ushort)((x << 8) | (x >> 8)); + + // Kernel struct layouts (must match include/uapi/linux/io_uring.h) + [StructLayout(LayoutKind.Sequential)] + public struct SqRingOffsets { + public uint head, tail, ring_mask, ring_entries, flags, dropped, array, resv1; + public ulong resv2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct CqRingOffsets { + public uint head, tail, ring_mask, ring_entries, overflow, cqes, flags, resv1; + public ulong resv2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct IoUringParams { + public uint sq_entries, cq_entries, flags, sq_thread_cpu, sq_thread_idle; + public uint features, wq_fd, resv0, resv1, resv2; + public SqRingOffsets sq_off; + public CqRingOffsets cq_off; + } + + [StructLayout(LayoutKind.Explicit, Size = 64)] + public struct IoUringSqe { + [FieldOffset(0)] public byte opcode; + [FieldOffset(1)] public byte flags; + [FieldOffset(2)] public ushort ioprio; + [FieldOffset(4)] public int fd; + [FieldOffset(8)] public ulong off; + [FieldOffset(16)] public ulong addr; + [FieldOffset(24)] public uint len; + [FieldOffset(28)] public uint op_flags; + [FieldOffset(32)] public ulong user_data; + [FieldOffset(40)] public ushort buf_index; + 
[FieldOffset(42)] public ushort personality; + [FieldOffset(44)] public int splice_fd_in; + [FieldOffset(48)] public ulong addr3; + [FieldOffset(56)] public ulong __pad2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct IoUringCqe { + public ulong user_data; + public int res; + public uint flags; + } + + // Argument struct for IORING_REGISTER_PBUF_RING. + [StructLayout(LayoutKind.Sequential)] + public struct io_uring_buf_reg { + public ulong ring_addr; + public uint ring_entries; + public ushort bgid; + public ushort flags; + public ulong resv1, resv2, resv3; + } + + [StructLayout(LayoutKind.Sequential)] + public struct in_addr { public uint s_addr; } + + [StructLayout(LayoutKind.Sequential)] + public unsafe struct sockaddr_in { + public ushort sin_family; + public ushort sin_port; + public in_addr sin_addr; + public fixed byte sin_zero[8]; + } +} diff --git a/MinimaTPool/io_uring/Ring.cs b/MinimaTPool/io_uring/Ring.cs new file mode 100644 index 0000000..940a486 --- /dev/null +++ b/MinimaTPool/io_uring/Ring.cs @@ -0,0 +1,179 @@ +using System.Runtime.CompilerServices; +using static MinimaTPool.Native; + +// ReSharper disable SuggestVarOrType_BuiltInTypes +// ReSharper disable SuggestVarOrType_Elsewhere +#pragma warning disable CA1806 + +namespace MinimaTPool; + +public sealed unsafe class Ring : IDisposable +{ + private int _fd; + + public int Fd => _fd; + + private uint* _sqHead; + private uint* _sqTail; + private uint* _sqArray; + private uint _sqMask; + private uint _sqEntries; + private IoUringSqe* _sqes; + + private uint* _cqHead; + private uint* _cqTail; + private IoUringCqe* _cqes; + private uint _cqMask; + + private uint _sqeTail; + + private byte* _ringPtr; + private nuint _ringSize; + private byte* _sqePtr; + private nuint _sqeSize; + + public static Ring Create(uint entries) + { + IoUringParams ioUringParams = default; + ioUringParams.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN; + int fd = io_uring_setup(entries, 
&ioUringParams); + if (fd < 0) + { + throw new InvalidOperationException($"io_uring_setup failed: {fd}"); + } + + var ring = new Ring + { + _fd = fd, + _sqEntries = ioUringParams.sq_entries + }; + + nuint sqRingBytes = ioUringParams.sq_off.array + ioUringParams.sq_entries * sizeof(uint); + nuint cqRingBytes = ioUringParams.cq_off.cqes + ioUringParams.cq_entries * (nuint)sizeof(IoUringCqe); + nuint ringBytes = sqRingBytes > cqRingBytes ? sqRingBytes : cqRingBytes; + + void* ringMem = mmap(null, ringBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (ringMem == (void*)-1) + { + close(fd); + + throw new InvalidOperationException("mmap(SQ_RING) failed"); + } + ring._ringPtr = (byte*)ringMem; + ring._ringSize = ringBytes; + + nuint sqeBytes = ioUringParams.sq_entries * (nuint)sizeof(IoUringSqe); + void* sqeMem = mmap(null, sqeBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (sqeMem == (void*)-1) + { + munmap(ringMem, ringBytes); + close(fd); + + throw new InvalidOperationException("mmap(SQES) failed"); + } + ring._sqes = (IoUringSqe*)sqeMem; + ring._sqePtr = (byte*)sqeMem; + ring._sqeSize = sqeBytes; + + byte* ringPointer = (byte*)ringMem; + ring._sqHead = (uint*)(ringPointer + ioUringParams.sq_off.head); + ring._sqTail = (uint*)(ringPointer + ioUringParams.sq_off.tail); + ring._sqArray = (uint*)(ringPointer + ioUringParams.sq_off.array); + ring._sqMask = *(uint*)(ringPointer + ioUringParams.sq_off.ring_mask); + + ring._cqHead = (uint*)(ringPointer + ioUringParams.cq_off.head); + ring._cqTail = (uint*)(ringPointer + ioUringParams.cq_off.tail); + ring._cqes = (IoUringCqe*)(ringPointer + ioUringParams.cq_off.cqes); + ring._cqMask = *(uint*)(ringPointer + ioUringParams.cq_off.ring_mask); + + return ring; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public IoUringSqe* GetSqe() + { + uint head = Volatile.Read(ref *_sqHead); + + if (_sqeTail - head >= _sqEntries) + { + return null; + } + 
+ uint slot = _sqeTail & _sqMask; + _sqArray[slot] = slot; + _sqeTail++; + + return &_sqes[slot]; + } + + public int SubmitAndWait(uint waitFor) + { + uint published = *_sqTail; + uint toSubmit = _sqeTail - published; + + if (toSubmit > 0) + { + Volatile.Write(ref *_sqTail, _sqeTail); + } + + if (toSubmit == 0 && waitFor == 0) return 0; + + uint flags = waitFor > 0 ? IORING_ENTER_GETEVENTS : 0; + + return io_uring_enter(_fd, toSubmit, waitFor, flags); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetCqe(out IoUringCqe cqe) + { + uint head = *_cqHead; + uint tail = Volatile.Read(ref *_cqTail); + + if (head == tail) + { + cqe = default; + + return false; + } + + cqe = _cqes[head & _cqMask]; + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqeSeen() => Volatile.Write(ref *_cqHead, *_cqHead + 1); + + // Batched CQ drain (liburing io_uring_for_each_cqe + io_uring_cq_advance): + // read the kernel-written tail once (acquire), process the whole batch, + // then publish the consumed head once (release) instead of once per CQE. 
+ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n); + + public void Dispose() + { + if (_ringPtr != null) + { + munmap(_ringPtr, _ringSize); _ringPtr = null; + } + + if (_sqePtr != null) + { + munmap(_sqePtr, _sqeSize); _sqePtr = null; + } + + if (_fd > 0) + { + close(_fd); _fd = 0; + } + } +} + +#pragma warning restore CA1806 diff --git a/Shrike.Playground/Program.cs b/Shrike.Playground/Program.cs index 070112f..6ce3468 100644 --- a/Shrike.Playground/Program.cs +++ b/Shrike.Playground/Program.cs @@ -10,7 +10,7 @@ public static void Main() { var engine = ShrikeEngine .CreateBuilder() - .SetNWorkersSolver(() => 12) + .SetNWorkersSolver(() => 6) .SetBacklog(16384) .SetMaxEventsPerWake(512) .SetMaxNumberConnectionsPerWorker(512) @@ -23,8 +23,7 @@ public static void Main() // Same knob + object as Minima / AspBaseline / SocketBaseline: serialize a // WORK_ITEMS-element object to JSON on the THREAD POOL per request. 0/unset = inline. - private static readonly int WorkItems = - int.TryParse(Environment.GetEnvironmentVariable("WORK_ITEMS"), out int n) ? n : 0; + private static readonly int WorkItems = 1000; private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1)); @@ -64,12 +63,14 @@ private static async Task HandleAsync(Connection conn) if (wrote) { + _ = await Task.Run(static () => JsonSerializer.Serialize("Hello World!")); + // Real async work on the THREAD POOL — handler resumes off-worker. Shrike's // FlushAsync does a thread-safe send() directly (epoll), so no handoff here. 
- if (WorkItems > 0) + /*if (WorkItems > 0) { _ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject)); - } + }*/ await conn.FlushAsync(); } @@ -80,14 +81,14 @@ private static async Task HandleAsync(Connection conn) private static unsafe void CommitPlainTextResponse(Connection connection) { - int tail = connection.WriteBuffer.Tail; - int contentLength = s_plainTextBody.Length; + //int tail = connection.WriteBuffer.Tail; + //int contentLength = s_plainTextBody.Length; connection.WriteBuffer.WriteUnmanaged("HTTP/1.1 200 OK\r\n"u8 + - "Content-Length: \r\n"u8 + + "Content-Length: 13\r\n"u8 + "Server: S\r\n"u8 + - "Content-Type: text/plain\r\n"u8); - connection.WriteBuffer.WriteUnmanaged(DateHelper.HeaderBytes); + "Content-Type: text/plain\r\n\r\nHello, World!"u8); + /*connection.WriteBuffer.WriteUnmanaged(DateHelper.HeaderBytes); connection.WriteBuffer.WriteUnmanaged(s_plainTextBody); // Patch the 2-digit Content-Length into the reserved spaces (offset matches the header above). 
@@ -95,7 +96,7 @@ private static unsafe void CommitPlainTextResponse(Connection connection) int tens = contentLength / 10; int ones = contentLength - tens * 10; dst[0] = (byte)('0' + tens); - dst[1] = (byte)('0' + ones); + dst[1] = (byte)('0' + ones);*/ } } diff --git a/Shrike/Engine/Connection.cs b/Shrike/Engine/Connection.cs index 7dfd86e..9ef4846 100644 --- a/Shrike/Engine/Connection.cs +++ b/Shrike/Engine/Connection.cs @@ -39,13 +39,13 @@ public enum FlushResult { Complete, Incomplete, Close } public int Ep; // ---- read IVTS (result = isClosed) ---- - private ManualResetValueTaskSourceCore _readSignal = new() { RunContinuationsAsynchronously = false }; + private ManualResetValueTaskSourceCore _readSignal = new() { RunContinuationsAsynchronously = true }; private int _armed; private int _pending; private int _closed; // ---- flush IVTS ---- - private ManualResetValueTaskSourceCore _flushSignal = new() { RunContinuationsAsynchronously = false }; + private ManualResetValueTaskSourceCore _flushSignal = new() { RunContinuationsAsynchronously = true }; private int _flushArmed; public Connection(int maxConnections, int inSlabSize, int outSlabSize) diff --git a/Shrike/Writers/FixedBufferWriter.cs b/Shrike/Writers/FixedBufferWriter.cs index efdf9e2..dd83397 100644 --- a/Shrike/Writers/FixedBufferWriter.cs +++ b/Shrike/Writers/FixedBufferWriter.cs @@ -26,47 +26,13 @@ namespace Shrike; [SkipLocalsInit] public unsafe class FixedBufferWriter : IUnmanagedBufferWriter, IBufferWriter, IDisposable { - // ========================================================================= - // Fields - // ========================================================================= - - /// - /// The total capacity (in bytes) of the memory region represented by this writer. - /// private readonly int _capacity; - private readonly UnmanagedMemoryManager _manager; - - /// - /// The current read position (if the buffer is also reused for reads). 
- /// Not used by the writer itself, but exposed for external control. - /// public int Head; - - /// - /// The current write position. Bytes have been written in [0 .. Tail). - /// public int Tail; - - /// - /// Pointer to the beginning of the unmanaged buffer. - /// + public byte* Ptr { get; } - - // ========================================================================= - // Constructor - // ========================================================================= - - /// - /// Creates a new instance over an unmanaged - /// memory region. - /// - /// must point to a memory block of at least - /// bytes that remains valid for the lifetime - /// of this struct. - /// - /// Pointer to the start of the unmanaged buffer. - /// Maximum number of bytes writable to the buffer. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public FixedBufferWriter(byte* ptr, int capacity) { @@ -77,28 +43,14 @@ public FixedBufferWriter(byte* ptr, int capacity) _manager = new UnmanagedMemoryManager(ptr, capacity); } - - // ========================================================================= - // Core Methods - // ========================================================================= - - /// - /// Resets both read () and write () - /// indices to zero, effectively clearing the buffer (logically). - /// - /// Does not modify the underlying memory — only the pointers. - /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Reset() { Head = 0; Tail = 0; } - - /// - /// Advances the write pointer by bytes after data - /// has been written directly into the memory region. 
- /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Advance(int count) { @@ -110,55 +62,39 @@ public Memory GetMemory(int sizeHint = 0) { int remaining = _capacity - Tail; if (sizeHint > remaining) + { throw new InvalidOperationException("Buffer too small."); + } return _manager.Memory.Slice(Tail, remaining); } - /// - /// Gets a raw unmanaged pointer to the start of the buffer. - /// This is mainly for interop or direct native I/O operations (e.g. send()). - /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public byte* GetPointer() => Ptr; - - /// - /// Returns a writable over the remaining space in - /// the buffer, starting at the current position. - /// - /// Throws if the requested - /// would exceed the buffer capacity. - /// - /// The minimum required size for the writable region. + public Span GetSpan(int sizeHint = 0) { if (Tail + sizeHint > _capacity) + { throw new InvalidOperationException("Buffer too small."); + } return new Span(Ptr + Tail, _capacity - Tail); } - - // ========================================================================= - // Write Helpers - // ========================================================================= - - /// - /// Copies unmanaged data directly into the buffer using a raw pointer copy. - /// Slightly faster than for large spans because it avoids - /// intermediate range checks. - /// - /// The caller must ensure does not overlap the target region. - /// - /// Data to copy into the buffer. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public void WriteUnmanaged(ReadOnlySpan source) { int len = source.Length; if (Tail + len > _capacity) + { throw new InvalidOperationException("Buffer too small."); + } fixed (byte* src = source) + { Buffer.MemoryCopy(src, Ptr + Tail, _capacity - Tail, len); + } Tail += len; } @@ -170,40 +106,24 @@ public void WriteUnmanaged(string source) Tail += bytesWritten; } - /// - /// Copies data from a managed into the unmanaged buffer. 
- /// This version uses which performs bounds checks - /// and is safe for managed callers. - /// - /// The data to copy into the buffer. [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Write(ReadOnlySpan source) { int len = source.Length; if (Tail + len > _capacity) + { throw new InvalidOperationException("Buffer too small."); + } source.CopyTo(new Span(Ptr + Tail, _capacity - Tail)); Tail += len; } - // ========================================================================= - // Disposal - // ========================================================================= - - /// - /// Releases the unmanaged memory associated with this writer if it owns the pointer. - /// - /// If points to a shared memory region (e.g. part of a - /// connection pool or slab allocator), calling this will free that memory - /// globally — causing use-after-free crashes for other users. - /// - /// Only call this when you know this instance *owns* the buffer and no one else - /// references it. - /// public void Dispose() { if (Ptr != null) + { NativeMemory.AlignedFree(Ptr); + } } } \ No newline at end of file diff --git a/SocketBaseline/Program.cs b/SocketBaseline/Program.cs index cd20f3d..35ee6b6 100644 --- a/SocketBaseline/Program.cs +++ b/SocketBaseline/Program.cs @@ -5,7 +5,7 @@ // Raw System.Net.Sockets HTTP/1.1 server — NO ASP.NET, NO Kestrel. A single async accept // loop; each connection is handled on the thread pool via the runtime's async socket engine // (epoll-backed on Linux). Same WORK_ITEMS knob + same object as Minima / AspBaseline. 
-int workItems = 50; +int workItems = 1000; Payload largeObject = BuildPayload(Math.Max(workItems, 1)); byte[] response = "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8.ToArray(); @@ -32,14 +32,16 @@ async Task HandleAsync(Socket client) { int read = await client.ReceiveAsync(buf.AsMemory(), SocketFlags.None); if (read <= 0) break; // peer closed + + _ = await Task.Run(static () => JsonSerializer.Serialize("Hello World!")); // Same work as Minima/AspBaseline: serialize the object on the thread pool // (the handler already runs there) and discard. WORK_ITEMS=0 → plain "ok". - if (workItems > 0) + /*if (workItems > 0) { // Force threadpool _ = await Task.Run(() => JsonSerializer.SerializeToUtf8Bytes(largeObject)); - } + }*/ int sent = 0; while (sent < response.Length) diff --git a/Spring.Demo/Program.cs b/Spring.Demo/Program.cs index 16ff3d1..cf0f74e 100644 --- a/Spring.Demo/Program.cs +++ b/Spring.Demo/Program.cs @@ -10,14 +10,10 @@ kestrel.ListenAnyIP(8080); }); -// SPRING=0 → Kestrel's default Socket transport (baseline). Otherwise the io_uring Spring transport. 
-if (Environment.GetEnvironmentVariable("SPRING") != "0") -{ - builder.WebHost.UseSpring(opts => opts.ReactorCount = Math.Max(1, 12)); -} +builder.WebHost.UseSpring(opts => opts.ReactorCount = Math.Max(1, 12)); var app = builder.Build(); -app.MapGet("/", () => "Hello from Spring + Kestrel\n"); +app.MapGet("/", () => "Hello, World!\n"); app.Run(); diff --git a/docs/blog/blog.css b/docs/blog/blog.css index 4c77021..566414a 100644 --- a/docs/blog/blog.css +++ b/docs/blog/blog.css @@ -1,14 +1,14 @@ :root { - --bg: #0a0a0f; - --bg-card: #12121a; - --bg-code: #1a1a2e; - --border: #1e1e30; - --text: #e0e0e8; - --text-muted: #8888a0; - --accent-zerg: #7c5cfc; - --accent-zerg-dim: #5a3ed8; - --accent-terraform: #00c896; - --accent-hot: #ff6b6b; + --bg: #eef1f5; + --bg-card: #ffffff; + --bg-code: #f4f7fb; + --border: #d4dbe5; + --text: #1a2540; + --text-muted: #5b6679; + --accent-zerg: #1e40af; + --accent-zerg-dim: #2563eb; + --accent-terraform: #0d9488; + --accent-hot: #b91c1c; --font-mono: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', 'Consolas', monospace; --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; } @@ -29,7 +29,7 @@ nav { top: 0; width: 100%; z-index: 100; - background: rgba(10, 10, 15, 0.85); + background: rgba(238, 241, 245, 0.85); backdrop-filter: blur(16px); border-bottom: 1px solid var(--border); padding: 0 2rem; @@ -186,7 +186,7 @@ nav .links a.active { color: var(--accent-zerg); } .post-body a { color: var(--accent-zerg); text-decoration: none; - border-bottom: 1px solid rgba(124, 92, 252, 0.3); + border-bottom: 1px solid rgba(30, 64, 175, 0.3); transition: border-color 0.2s; } .post-body a:hover { border-bottom-color: var(--accent-zerg); } @@ -194,7 +194,7 @@ nav .links a.active { color: var(--accent-zerg); } font-family: var(--font-mono); font-size: 0.88em; color: var(--accent-zerg); - background: rgba(124, 92, 252, 0.1); + background: rgba(30, 64, 175, 0.1); padding: 0.1em 0.35em; border-radius: 4px; } @@ -228,28 +228,28 
@@ nav .links a.active { color: var(--accent-zerg); } .token.comment, .token.prolog, .token.doctype, -.token.cdata { color: #546e7a; font-style: italic; } +.token.cdata { color: #7d8590; font-style: italic; } .token.punctuation { color: var(--text); } .token.namespace { opacity: 0.75; } .token.keyword, .token.boolean, .token.constant, -.token.symbol { color: #c792ea; } -.token.number { color: #f78c6c; } +.token.symbol { color: #1d4ed8; } +.token.number { color: #9a3412; } .token.string, .token.char, .token.attr-value, -.token.regex { color: #c3e88d; } +.token.regex { color: #1a7f37; } .token.class-name, -.token.builtin { color: #82aaff; } -.token.function { color: #f0c674; } +.token.builtin { color: #0550ae; } +.token.function { color: #1e40af; } .token.operator, .token.entity, -.token.url { color: #89ddff; } +.token.url { color: #0e7490; } .token.variable, .token.parameter { color: var(--text); } -.token.attr-name { color: #82aaff; } -.token.preprocessor { color: #c792ea; opacity: 0.85; } +.token.attr-name { color: #0550ae; } +.token.preprocessor { color: #1d4ed8; opacity: 0.9; } .post-body ul { margin: 1.25rem 0; padding-left: 0.5rem; @@ -270,6 +270,81 @@ nav .links a.active { color: var(--accent-zerg); } font-weight: bold; } .post-body strong { color: var(--text); font-weight: 700; } +.post-body .note { + border-left: 3px solid var(--accent-zerg); + background: rgba(30, 64, 175, 0.04); + padding: 0.75rem 1rem; + margin: 1.25rem 0; + color: var(--text-muted); + font-style: italic; + border-radius: 0 6px 6px 0; +} + +/* TABLES */ +.post-body .table-scroll { + overflow-x: auto; + margin: 1.5rem 0; + border: 1px solid var(--border); + border-radius: 8px; + background: var(--bg-card); +} +.post-body .table-scroll table { + width: 100%; + border-collapse: collapse; + font-family: var(--font-mono); + font-size: 0.85rem; +} +.post-body table th, +.post-body table td { + text-align: left; + padding: 0.6rem 0.9rem; + border-bottom: 1px solid var(--border); + white-space: 
nowrap; +} +.post-body table thead th { + color: var(--accent-zerg); + font-weight: 700; + background: rgba(30, 64, 175, 0.08); + border-bottom: 1px solid var(--border); +} +.post-body table tbody tr:last-child td { border-bottom: none; } +.post-body table tbody tr:hover { background: rgba(30, 64, 175, 0.04); } + +/* Wide table variant: bleeds beyond the 760px article column on large screens */ +.post-body .table-scroll.table-wide { + margin-left: -260px; + margin-right: -260px; +} +.post-body .table-scroll.table-wide table th, +.post-body .table-scroll.table-wide table td { + padding: 0.5rem 0.55rem; + font-size: 0.8rem; +} +.post-body table th.col-group { + text-align: center; + border-right: 1px solid var(--border); + border-left: 1px solid var(--border); +} +.post-body table th.col-group-end, +.post-body table td.col-group-end { border-right: 1px solid var(--border); } +@media (max-width: 1400px) { + .post-body .table-scroll.table-wide { + margin-left: -120px; + margin-right: -120px; + } +} +@media (max-width: 1024px) { + .post-body .table-scroll.table-wide { + margin-left: -40px; + margin-right: -40px; + } +} +@media (max-width: 768px) { + .post-body .table-scroll.table-wide { + margin-left: 0; + margin-right: 0; + } +} /* GLOSSARY */ .glossary { diff --git a/docs/blog/images/part-6-hero.png b/docs/blog/images/part-6-hero.png new file mode 100644 index 0000000..74d4cc2 Binary files /dev/null and b/docs/blog/images/part-6-hero.png differ diff --git a/docs/blog/index.html b/docs/blog/index.html index 9e7cf21..626fd28 100644 --- a/docs/blog/index.html +++ b/docs/blog/index.html @@ -25,6 +25,13 @@

Blog

Notes on io_uring, low-level networking and the internals of zerg.

    +
  • + + +
    C# Networking Deep Dive With io_uring — Part 6: Numbers
    +
    Six implementations on the same i9 14900k box, two workloads (sync plaintext "OK" and an async Task.Run doing a trivial JsonSerializer.Serialize), measured with wrk and gcannon. io_uring r+w with IVTS inline tops the sync chart at 3.95M req/s but turns very unstable under async; once the handler leaves the reactor, the io_uring/threadpool variant drops to ~2.4M while epoll/threadpool and Kestrel's stock socket both hold ~3.6M sync / ~3.0M async. A hybrid io_uring recv + libc send lands in the middle. Includes a comparison-at-a-glance table.
    +
    +
  • diff --git a/docs/blog/io-uring-minima-part-6.html b/docs/blog/io-uring-minima-part-6.html new file mode 100644 index 0000000..a45d52e --- /dev/null +++ b/docs/blog/io-uring-minima-part-6.html @@ -0,0 +1,470 @@ + + + + + + C# Networking Deep Dive With io_uring — Part 6 — zerg + + + + + +
    + ← All posts + +
    + A dense, chaotic scattering of black numerals on a white background, overlapping at every angle. + +
    +

    C# Networking Deep Dive With io_uring — Part 6: Numbers

    + +
    + +
    +

    For part 6, let's run some benchmarks.

    + +

    These values are not "scientific", just a ballpark estimate.

    + +

    What is going to be benchmarked

    + +
      +
    • io_uring read+write with IVTS reactor inline continuations (RunContinuationsAsynchronously = false)
    • +
    • io_uring read+write without IVTS reactor inline continuations (threadpool) (RunContinuationsAsynchronously = true)
    • +
    • io_uring read + libc send write without IVTS reactor inline continuations (threadpool) (RunContinuationsAsynchronously = true)
    • +
    • epoll read+write with IVTS reactor inline continuations
    • +
    • epoll read+write without IVTS reactor inline continuations
    • +
    • System.Net.Sockets (Kestrel stock) - epoll threadpool
    • +
    + +

    Tests

    + +

    (No pipelining)

    + +
      +
    • Synchronous lightweight plaintext "OK" response.
    • +
    • Asynchronous workload: _ = await Task.Run(static () => JsonSerializer.Serialize("Hello World!"));
    • +
    + +

    The purpose of the async workload is to force the continuation onto the threadpool, not to model a heavy async workload.

    + +

    Hardware

    + +

    i9 14900k
    + 64GB DDR5 6400MHz
    + Linux Kernel 6.17.0-22-generic

    + +

    Tests are done through localhost loopback (no NIC influence)
    + MTU 1500

    + +

    Load generators

    + +

    HTTP/1.1, no TLS

    + +

    + wrk (epoll)
    + gcannon (io_uring) +

    + +

    io_uring read+write with IVTS reactor inline continuations

    + +

    This is the exact model explored throughout the series; it is expected to deliver high performance on the synchronous test.

    + +

    Reactor count: 12

    + +

    Sync workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg121.45us129us
    Latency Stdev178.81us
    Latency Max8.32ms
    Latency p50125us
    Latency p90185us
    Latency p99245us
    Latency p99.9317us
    Req/Sec Avg201.31k3.95M
    Requests Total18,299,27819,735,722
    Duration5.10s5.00s
    Transfer/Bandwidth225.84MB/s248.42MB/s
    + +

    Async workload (very unstable)

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg435.74us185us
    Latency Stdev795.84us
    Latency Max12.73ms
    Latency p50135us
    Latency p90229us
    Latency p991.84ms
    Latency p99.94.10ms
    Req/Sec Avg142.93k2.76M
    Requests Total12,883,29413,797,048
    Duration5.10s5.00s
    Transfer/Bandwidth159.05MB/s173.67MB/s
    + +

    io_uring read+write without IVTS reactor inline

    + +

    A model similar to the one explored throughout the series, but with RunContinuationsAsynchronously set to true on both IVTS; it is expected to deliver similar results on both tests.

    + +

    Reactor count: 12

    + +

    Sync workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg515.72us211us
    Latency Stdev821.99us
    Latency Max12.67ms
    Latency p50164us
    Latency p90273us
    Latency p991.55ms
    Latency p99.93.79ms
    Req/Sec Avg110.03k2.41M
    Requests Total9,946,28212,080,236
    Duration5.10s5.00s
    Transfer/Bandwidth122.80MB/s151.97MB/s
    + +

    Async workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg530.17us213us
    Latency Stdev842.05us
    Latency Max13.37ms
    Latency p50146us
    Latency p90265us
    Latency p992.27ms
    Latency p99.94.38ms
    Req/Sec Avg108.43k2.39M
    Requests Total9,726,08311,952,675
    Duration5.03s5.00s
    Transfer/Bandwidth121.82MB/s150.45MB/s
    + +

    io_uring read + libc send write without IVTS reactor inline continuations

    + +

    Similar model explored throughout the series but with RunContinuationsAsynchronously set to true on both IVTS instances, and the write branch does not use io_uring; instead we use libc's send. Expected to deliver close results on both tests. This is a hybrid approach and should be the middle ground between the first two models.

    + +

    Reactor count: 12

    + +

    Sync workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg410.23us154us
    Latency Stdev782.03us
    Latency Max12.08ms
    Latency p5084us
    Latency p90176us
    Latency p992.68ms
    Latency p99.94.32ms
    Req/Sec Avg158.40k3.31M
    Requests Total14,361,23916,551,871
    Duration5.10s5.00s
    Transfer/Bandwidth0.88GB read208.27MB/s
    + +

    Async workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg418.96us159us
    Latency Stdev824.32us
    Latency Max17.51ms
    Latency p5085us
    Latency p90198us
    Latency p991.99ms
    Latency p99.94.41ms
    Req/Sec Avg154.72k3.20M
    Requests Total13,955,37115,997,491
    Duration5.09s5.00s
    Transfer/Bandwidth172.59MB/s201.18MB/s
    + +

    epoll read+write with IVTS reactor inline continuations

    + +

    Pure epoll approach with same reactor threading architecture. Inline handler continuation for both IVTS.

    + +

    Reactor count: 12

    + +

    Sync workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg284.42us160us
    Latency Stdev610.90us
    Latency Max11.06ms
    Latency p5086us
    Latency p90194us
    Latency p992.07ms
    Latency p99.94.39ms
    Req/Sec Avg188.08k3.17M
    Requests Total17,141,22515,856,691
    Duration5.10s5.00s
    Transfer/Bandwidth403.61MB/s199.56MB/s
    + +

    Async workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg458.63us159us
    Latency Stdev0.90ms
    Latency Max15.96ms
    Latency p5074us
    Latency p90185us
    Latency p992.68ms
    Latency p99.95.32ms
    Req/Sec Avg150.84k3.08M
    Requests Total13,670,69715,386,279
    Duration5.10s5.00s
    Transfer/Bandwidth322.12MB/s369.72MB/s
    + +

    epoll read+write without IVTS reactor inline continuations

    + +

    Pure epoll approach with same reactor threading architecture. Threadpool handler continuation for both IVTS.

    + +

    Reactor count: 6

    + +

    Sync workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg391.31us140us
    Latency Stdev764.42us
    Latency Max13.71ms
    Latency p5096us
    Latency p90150us
    Latency p992.06ms
    Latency p99.94.15ms
    Req/Sec Avg167.26k3.60M
    Requests Total15,179,06618,019,801
    Duration5.10s5.00s
    Transfer/Bandwidth357.60MB/s432.83MB/s
    + +

    Async workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg464.15us154us
    Latency Stdev838.78us
    Latency Max10.74ms
    Latency p5096us
    Latency p90154us
    Latency p992.22ms
    Latency p99.94.48ms
    Req/Sec Avg158.12k3.27M
    Requests Total14,231,17616,342,325
    Duration5.10s5.00s
    Transfer/Bandwidth236.89MB/s277.35MB/s
    + +

    System.Net.Socket (Kestrel stock) - epoll threadpool

    + +

    Kestrel's stock network I/O with some tuning

    + +

    Sync workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg156.79us141us
    Latency Stdev342.31us
    Latency Max6.98ms
    Latency p50129us
    Latency p90176us
    Latency p99305us
    Latency p99.93.17ms
    Req/Sec Avg174.25k3.60M
    Requests Total15,748,22318,024,579
    Duration5.10s5.00s
    Transfer/Bandwidth194.39MB/s226.84MB/s
    + +

    Async workload

    + +
    + + + + + + + + + + + + + + +
    Metricwrkgcannon
    Latency Avg255.07us169us
    Latency Stdev507.29us
    Latency Max12.53ms
    Latency p50123us
    Latency p90237us
    Latency p991.25ms
    Latency p99.93.89ms
    Req/Sec Avg150.64k3.01M
    Requests Total13,618,90615,043,820
    Duration5.10s5.00s
    Transfer/Bandwidth168.14MB/s189.25MB/s
    + +

    Comparison at a glance

    + +

    wrk and gcannon req/s and avg latency for every model, side by side.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ImplementationReactorsSyncAsync
    wrk req/swrk avggcannon req/sgcannon avgwrk req/swrk avggcannon req/sgcannon avg
    io_uring r+w, IVTS inline123.59M121.45us3.95M129us2.53M*435.74us*2.76M*185us*
    io_uring r+w, threadpool121.95M515.72us2.41M211us1.93M530.17us2.39M213us
    io_uring recv + libc send122.82M410.23us3.31M154us2.74M418.96us3.20M159us
    epoll r+w, IVTS inline123.36M284.42us3.17M160us2.68M458.63us3.08M159us
    epoll r+w, threadpool62.98M391.31us3.60M140us2.79M464.15us3.27M154us
    System.Net.Socket (Kestrel stock)3.09M156.79us3.60M141us2.67M255.07us3.01M169us
    + +

    CPU usage: the inline-IVTS cases (io_uring r+w IVTS, epoll r+w IVTS) cap at around 1200% max, while every other model averages ~1600%.

    + +

    * Async run flagged as very unstable in the original write-up.

    + +

    Conclusion

    + +

    The numbers are aligned with part 5's rant. On a fully synchronous benchmark, io_uring with the reactor inline continuation rides ahead, because there are no cross-thread hand-offs.

    + +

    Force the continuation on the threadpool (async workload) and that lead evaporates. The hybrid approach reclaims most of it and is a serious contender for further tests with Kestrel integration.

    + +

    A small note on the load generators: the results are quite interesting. gcannon seems a lot more stable on latency values, while wrk is all over the place.

    + +

    It is important to highlight that the reactor inline sync models consume on average 20% less CPU, as they are bound to 12 reactor CPU threads. On the other hand, solutions that allow threadpool continuation will use as much CPU as is available. For example, epoll r+w IVTS inline can actually yield 3.9M rps if we increase the reactor count to 16, surpassing System.Net.Socket performance for the same CPU usage.

    + +

    A very surprising result on epoll r+w threadpool: I was expecting the performance to be equal to System.Net.Socket. This will be quite interesting for part 7.

    + +

    In part 7, some of these models will be integrated into Kestrel/ASP.NET for direct benchmark comparison.

    +
    +
    +
    + + + + + + + + diff --git a/docs/index.html b/docs/index.html index c1ecf6c..94e7dbd 100644 --- a/docs/index.html +++ b/docs/index.html @@ -6,17 +6,17 @@ zerg - High-Performance TCP Server Framework for C#