diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f159245..a7ab17d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,6 +39,9 @@ jobs: - name: Pack terraform run: dotnet pack terraform/terraform.csproj --configuration Release --no-build --output ./artifacts + + - name: Pack Twinflow + run: dotnet pack Twinflow/Twinflow.csproj --configuration Release --no-build --output ./artifacts - name: Publish to NuGet if: github.event_name == 'push' && github.ref == 'refs/heads/main' diff --git a/KestrelShrike.Demo/KestrelShrike.Demo.csproj b/KestrelShrike.Demo/KestrelShrike.Demo.csproj new file mode 100644 index 0000000..3be18a9 --- /dev/null +++ b/KestrelShrike.Demo/KestrelShrike.Demo.csproj @@ -0,0 +1,15 @@ + + + + net10.0 + enable + enable + true + true + + + + + + + diff --git a/KestrelShrike.Demo/Program.cs b/KestrelShrike.Demo/Program.cs new file mode 100644 index 0000000..0a24309 --- /dev/null +++ b/KestrelShrike.Demo/Program.cs @@ -0,0 +1,23 @@ +using Microsoft.Extensions.Logging; +using KestrelShrike; + +var builder = WebApplication.CreateBuilder(args); + +builder.Logging.SetMinimumLevel(LogLevel.Warning); // benchmark: silence per-request logs + +builder.WebHost.UseKestrel(kestrel => +{ + kestrel.ListenAnyIP(8080); +}); + +// SHRIKE=0 → Kestrel's default Socket transport (baseline). Otherwise the epoll Shrike transport. 
+if (Environment.GetEnvironmentVariable("SHRIKE") != "0")
+{
+    builder.WebHost.UseShrike(opts => opts.ReactorCount = Math.Max(1, 4));
+}
+
+var app = builder.Build();
+
+app.MapGet("/", () => "Hello from Shrike + Kestrel\n");
+
+app.Run();
diff --git a/KestrelShrike/EpollConnection.cs b/KestrelShrike/EpollConnection.cs
new file mode 100644
index 0000000..6ea040d
--- /dev/null
+++ b/KestrelShrike/EpollConnection.cs
@@ -0,0 +1,230 @@
+namespace KestrelShrike;
+
+/// <summary>
+/// One TCP connection bridged to Kestrel through two BCL Pipes:
+///  - Input:  the reactor drains recv into Input.Writer; Kestrel reads Input.Reader.
+///  - Output: Kestrel writes Output.Writer; the per-connection pump reads Output.Reader
+///            and sends — DIRECTLY from the thread-pool thread, because an epoll
+///            socket's send() is thread-safe. No reactor handoff (unlike io_uring's
+///            single-issuer ring). The reactor is only involved on EAGAIN (arm
+///            EPOLLOUT, signal the pump when writable).
+/// Lifetime: 2-ref count (reactor/recv side + pump side); the fd closes when both end.
+/// When the pump ends it shuts the socket down (both directions) so that the peer sees
+/// the close and the reactor observes a hang-up and releases its own reference.
+/// </summary>
+internal sealed class EpollConnection
+{
+    /// <summary>shutdown(2) "how" value: disallow further receives and sends.</summary>
+    private const int SHUT_RDWR = 2;
+
+    /// <summary>Size hint (bytes) requested from Input.Writer for each recv call.</summary>
+    private const int RecvChunk = 16 * 1024;
+
+    /// <summary>The connected socket's file descriptor.</summary>
+    public readonly int Fd;
+
+    /// <summary>The epoll instance used when re-arming interest for this socket.</summary>
+    public readonly int Ep;
+
+    private readonly EpollReactor _reactor;
+
+    /// <summary>Socket → Kestrel bytes (written by the reactor, read by Kestrel).</summary>
+    public readonly Pipe Input;
+
+    /// <summary>Kestrel → socket bytes (written by Kestrel, read by the output pump).</summary>
+    public readonly Pipe Output;
+
+    private TaskCompletionSource<bool>? _writable; // set while the pump waits for EPOLLOUT
+    private int _refs = 2;                         // reactor/recv side + pump side
+    private int _closed;                           // 0 = open, 1 = MarkClosed() has run
+
+    /// <summary>
+    /// shutdown(2): shut down part or all of a full-duplex connection. Declared here because
+    /// this class is its only user. Returns 0 on success, -1 on error (errno set).
+    /// </summary>
+    [DllImport("libc", SetLastError = true)]
+    private static extern int shutdown(int sockfd, int how);
+
+    /// <summary>Creates the connection state and its two pipes for socket <paramref name="fd"/>.</summary>
+    public EpollConnection(int fd, int ep, EpollReactor reactor)
+    {
+        Fd = fd;
+        Ep = ep;
+        _reactor = reactor;
+
+        // Thresholds of 0 disable pipe back-pressure: neither writer is ever paused.
+        var pipeOptions = new PipeOptions(pauseWriterThreshold: 0, resumeWriterThreshold: 0, useSynchronizationContext: false);
+        Input = new Pipe(pipeOptions);
+        Output = new Pipe(pipeOptions);
+    }
+
+    /// <summary>True once <see cref="MarkClosed"/> has run.</summary>
+    public bool IsClosed => Volatile.Read(ref _closed) != 0;
+
+    // ---- recv (reactor thread): drain into Input.Writer. False => peer closed / error. ----
+    /// <summary>
+    /// Reads from the socket until it would block, appending everything to Input.Writer.
+    /// </summary>
+    /// <returns>false when the peer closed, a hard error occurred, or the connection is already closed.</returns>
+    public unsafe bool OnReadable()
+    {
+        if (IsClosed) return false;
+
+        bool any = false;
+        bool ok = true;
+        while (true)
+        {
+            Span<byte> span = Input.Writer.GetSpan(RecvChunk);
+            long n;
+            fixed (byte* p = span) n = recv(Fd, p, (ulong)span.Length, 0);
+
+            if (n > 0) { Input.Writer.Advance((int)n); any = true; continue; }
+            if (n == 0) { ok = false; break; }                  // peer closed
+
+            int err = Marshal.GetLastPInvokeError();
+            if (err is EAGAIN or EWOULDBLOCK) break;            // drained
+            if (err == EINTR) continue;
+            ok = false; break;                                  // hard error
+        }
+
+        // Fire-and-forget: with back-pressure disabled the flush never has to wait.
+        if (any) _ = Input.Writer.FlushAsync();
+        return ok;
+    }
+
+    // ---- output pump (thread pool) ----
+    /// <summary>
+    /// Sends everything Kestrel writes to Output until the writer completes, the read is
+    /// cancelled, or the socket fails; then shuts the socket down and drops the pump's reference.
+    /// </summary>
+    public async Task RunOutputPump()
+    {
+        PipeReader reader = Output.Reader;
+        try
+        {
+            while (true)
+            {
+                ReadResult r = await reader.ReadAsync().ConfigureAwait(false);
+                if (r.IsCanceled) break;
+
+                ReadOnlySequence<byte> buf = r.Buffer;
+                bool fail = false;
+
+                foreach (ReadOnlyMemory<byte> seg in buf)
+                {
+                    int off = 0;
+                    while (off < seg.Length)
+                    {
+                        int sent = TrySend(seg.Span.Slice(off), out bool wouldBlock, out bool closed);
+                        if (closed) { fail = true; break; }
+                        if (sent > 0) { off += sent; continue; }
+                        if (wouldBlock && !await WaitWritableAsync().ConfigureAwait(false)) { fail = true; break; }
+                        // EINTR (sent == 0, not wouldBlock, not closed) just retries
+                    }
+                    if (fail) break;
+                }
+
+                reader.AdvanceTo(buf.End);
+                if (fail || r.IsCompleted) break;
+            }
+        }
+        catch { /* connection died mid-send */ }
+        finally
+        {
+            try { reader.Complete(); } catch { }
+
+            // Nothing more will be sent. Shut the socket down so the peer sees the close and
+            // epoll reports a hang-up for this fd, which is what makes the reactor release its
+            // reference. Without this, a close initiated by the server (Kestrel completing or
+            // aborting the output) left the socket open until the peer closed it first.
+            // Best effort: the result is ignored (the peer may already be gone).
+            shutdown(Fd, SHUT_RDWR);
+
+            DecRef();
+        }
+    }
+
+    /// <summary>
+    /// One non-blocking send. Returns the bytes sent (> 0), or 0 with the reason in the out
+    /// flags: <paramref name="wouldBlock"/> (EAGAIN), <paramref name="closed"/> (hard error),
+    /// or neither (empty input or EINTR — the caller simply retries).
+    /// </summary>
+    private unsafe int TrySend(ReadOnlySpan<byte> data, out bool wouldBlock, out bool closed)
+    {
+        wouldBlock = false;
+        closed = false;
+        if (data.IsEmpty) return 0;
+
+        long n;
+        fixed (byte* p = data) n = send(Fd, p, data.Length, MSG_NOSIGNAL);
+        if (n > 0) return (int)n;
+
+        // A zero-byte result for a non-empty buffer is treated like EAGAIN.
+        int err = (n == 0) ? EAGAIN : Marshal.GetLastPInvokeError();
+        if (err is EAGAIN or EWOULDBLOCK) { wouldBlock = true; return 0; }
+        if (err == EINTR) return 0;
+        closed = true;
+        return 0;
+    }
+
+    // ---- EAGAIN: arm EPOLLOUT and wait for the reactor's writable signal ----
+    /// <returns>true when the socket became writable; false when the connection was closed.</returns>
+    private Task<bool> WaitWritableAsync()
+    {
+        if (IsClosed) return Task.FromResult(false);
+        var tcs = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+        Volatile.Write(ref _writable, tcs);
+        ArmEpollOut();                              // epoll_ctl is thread-safe
+        if (IsClosed) tcs.TrySetResult(false);      // raced with close
+        return tcs.Task;
+    }
+
+    /// <summary>Reactor thread: EPOLLOUT fired — go back to read-only interest and wake the pump.</summary>
+    public void SignalWritable()
+    {
+        TaskCompletionSource<bool>? tcs = Interlocked.Exchange(ref _writable, null);
+        if (tcs is not null)
+        {
+            ArmEpollIn();
+            tcs.TrySetResult(true);
+        }
+    }
+
+    /// <summary>
+    /// Reactor thread: marks the connection closed (idempotent), completes Input.Writer
+    /// (the reactor is its sole writer) and fails any pending writable wait.
+    /// </summary>
+    public void MarkClosed()
+    {
+        if (Interlocked.Exchange(ref _closed, 1) == 1) return;
+        try { Input.Writer.Complete(); } catch { }
+        Interlocked.Exchange(ref _writable, null)?.TrySetResult(false); // unblock the pump
+    }
+
+    /// <summary>
+    /// Drops one of the two references; the call that reaches zero unregisters the connection
+    /// from the reactor and closes the fd.
+    /// </summary>
+    public void DecRef()
+    {
+        if (Interlocked.Decrement(ref _refs) != 0) return;
+        _reactor.Remove(this);
+        close(Fd);
+    }
+
+    /// <summary>Edge-triggered interest in readable + writable (plus hang-up/error) events.</summary>
+    private unsafe void ArmEpollOut()
+    {
+        byte* ev = stackalloc byte[EvSize];
+        WriteEpollEvent(ev, (uint)(EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLERR | EPOLLHUP) | EPOLLET, Fd);
+        epoll_ctl(Ep, EPOLL_CTL_MOD, Fd, (IntPtr)ev);
+    }
+
+    /// <summary>Edge-triggered interest in readable (plus hang-up/error) events only.</summary>
+    private unsafe void ArmEpollIn()
+    {
+        byte* ev = stackalloc byte[EvSize];
+        WriteEpollEvent(ev, (uint)(EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP) | EPOLLET, Fd);
+        epoll_ctl(Ep, EPOLL_CTL_MOD, Fd, (IntPtr)ev);
+    }
+}
diff --git a/KestrelShrike/EpollEngine.cs b/KestrelShrike/EpollEngine.cs
new file mode 100644
index 0000000..34efcab
--- /dev/null
+++ b/KestrelShrike/EpollEngine.cs
@@ -0,0 +1,36 @@
+using System.Threading.Channels;
+
+namespace KestrelShrike;
+
+/// Owns N epoll reactors (each with its own SO_REUSEPORT listener) and funnels
accepted connections to Kestrel.
+internal sealed class EpollEngine
+{
+    private readonly EpollReactor[] _reactors;
+    private readonly Channel<EpollConnection> _accepted =
+        Channel.CreateUnbounded<EpollConnection>(new UnboundedChannelOptions { SingleReader = false, SingleWriter = false });
+
+    public EpollEngine(ushort port, int reactorCount, int backlog, int maxEvents)
+    {
+        _reactors = new EpollReactor[reactorCount];
+        for (int i = 0; i < reactorCount; i++) // every reactor writes its accepts into the shared channel
+            _reactors[i] = new EpollReactor(i, port, backlog, maxEvents) { OnAccept = c => _accepted.Writer.TryWrite(c) };
+    }
+
+    public void Start()
+    {
+        for (int i = 0; i < _reactors.Length; i++)
+        {
+            int idx = i; // copy for the closure below
+            var t = new Thread(() => _reactors[idx].Run()) { IsBackground = true, Name = $"shrike-k-r{idx}" };
+            t.Start();
+        }
+    }
+
+    public ValueTask<EpollConnection> AcceptAsync(CancellationToken ct) => _accepted.Reader.ReadAsync(ct);
+
+    public void Stop()
+    {
+        _accepted.Writer.TryComplete(); // pending and future AcceptAsync calls fail with ChannelClosedException
+        foreach (EpollReactor r in _reactors) r.Stop();
+    }
+}
diff --git a/KestrelShrike/EpollReactor.cs b/KestrelShrike/EpollReactor.cs
new file mode 100644
index 0000000..2d7a900
--- /dev/null
+++ b/KestrelShrike/EpollReactor.cs
@@ -0,0 +1,180 @@
+namespace KestrelShrike;
+
+/// <summary>
+/// One reactor = one thread + one epoll instance + its own SO_REUSEPORT listener
+/// (Minima/Shrike topology — kernel balances accepts, no acceptor thread). It is
+/// purely a readiness driver: EPOLLIN → drain recv into the connection's input
+/// Pipe (Kestrel reads it); EPOLLOUT → wake a pump that hit EAGAIN; error/hup →
+/// close. Response sends happen on the pump's thread, not here.
+/// </summary>
+internal sealed unsafe class EpollReactor
+{
+    public readonly int Id;
+    private readonly ushort _port;
+    private readonly int _backlog;
+    private readonly int _maxEvents;
+
+    private int _ep;
+    private int _listenFd;
+    private readonly ConcurrentDictionary<int, EpollConnection> _conns = new(); // fd → live connection
+    private volatile bool _running = true;
+
+    /// <summary>Invoked on the reactor thread for every accepted connection.</summary>
+    internal Action<EpollConnection>? OnAccept;
+
+    public EpollReactor(int id, ushort port, int backlog, int maxEvents)
+    {
+        Id = id;
+        _port = port;
+        _backlog = backlog;
+        _maxEvents = maxEvents;
+    }
+
+    /// <summary>
+    /// Asks the loop in <see cref="Run"/> to exit. The flag is only re-checked after epoll_wait
+    /// returns, and epoll_wait is called without a timeout.
+    /// </summary>
+    public void Stop() => _running = false;
+
+    /// <summary>Forgets <paramref name="conn"/>; removes the entry only if it is still mapped to this instance.</summary>
+    internal void Remove(EpollConnection conn) =>
+        _conns.TryRemove(new KeyValuePair<int, EpollConnection>(conn.Fd, conn));
+
+    /// <summary>
+    /// Reactor thread body: creates the epoll instance and the listener, then dispatches
+    /// readiness events until <see cref="Stop"/> is observed or epoll_wait fails.
+    /// </summary>
+    public void Run()
+    {
+        _ep = epoll_create1(EPOLL_CLOEXEC);
+        if (_ep < 0) throw new Exception("epoll_create1 failed");
+        _listenFd = OpenReusePortListener(_port, _backlog);
+
+        byte* lev = stackalloc byte[EvSize];
+        WriteEpollEvent(lev, (uint)(EPOLLIN | EPOLLERR | EPOLLHUP), _listenFd); // no EPOLLET: level-triggered
+        if (epoll_ctl(_ep, EPOLL_CTL_ADD, _listenFd, (IntPtr)lev) != 0)
+            throw new Exception("epoll_ctl ADD listen failed");
+
+        IntPtr eventsBuf = Marshal.AllocHGlobal(EvSize * _maxEvents);
+        Console.WriteLine($"[shrike-k r{Id}] listening on 0.0.0.0:{_port}");
+
+        try
+        {
+            while (_running)
+            {
+                int n = epoll_wait(_ep, eventsBuf, _maxEvents, -1);
+                if (n < 0) { if (Marshal.GetLastPInvokeError() == EINTR) continue; break; }
+
+                for (int i = 0; i < n; i++)
+                {
+                    ReadEpollEvent((byte*)eventsBuf + i * EvSize, out uint evs, out int fd);
+
+                    if (fd == _listenFd) { AcceptLoop(); continue; }
+
+                    if (!_conns.TryGetValue(fd, out var conn)) continue;
+
+                    // Drain readable data before acting on a hang-up: a peer that sends its
+                    // last bytes and closes right away is reported as EPOLLIN | EPOLLRDHUP in
+                    // one event, and those bytes must still reach Kestrel. OnReadable returns
+                    // false once it sees EOF or a hard error.
+                    if ((evs & (uint)EPOLLIN) != 0 && !conn.OnReadable())
+                    {
+                        Close(conn);
+                        continue;
+                    }
+
+                    if ((evs & (uint)(EPOLLERR | EPOLLHUP | EPOLLRDHUP)) != 0)
+                    {
+                        Close(conn);
+                        continue;
+                    }
+
+                    if ((evs & (uint)EPOLLOUT) != 0)
+                    {
+                        conn.SignalWritable();
+                    }
+                }
+            }
+        }
+        finally
+        {
+            Marshal.FreeHGlobal(eventsBuf);
+            close(_listenFd);
+            close(_ep);
+        }
+    }
+
+    /// <summary>Accepts until the listener would block, registering each new socket edge-triggered.</summary>
+    private void AcceptLoop()
+    {
+        // One scratch event for the whole burst: stackalloc inside the loop would grow the stack per accept.
+        byte* ev = stackalloc byte[EvSize];
+
+        for (;;)
+        {
+            int cfd = accept4(_listenFd, IntPtr.Zero, IntPtr.Zero, SOCK_NONBLOCK | SOCK_CLOEXEC);
+            if (cfd < 0)
+            {
+                if (Marshal.GetLastPInvokeError() == EINTR) continue;
+                break; // EAGAIN/EWOULDBLOCK (drained) or transient error
+            }
+
+            int one = 1;
+            setsockopt(cfd, IPPROTO_TCP, TCP_NODELAY, ref one, sizeof(int));
+
+            WriteEpollEvent(ev, (uint)(EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP) | EPOLLET, cfd);
+            if (epoll_ctl(_ep, EPOLL_CTL_ADD, cfd, (IntPtr)ev) != 0)
+            {
+                // Without a registration the socket would never produce an event and would
+                // simply leak; drop it instead of handing Kestrel a dead connection.
+                close(cfd);
+                continue;
+            }
+
+            var c = new EpollConnection(cfd, _ep, this);
+            _conns[cfd] = c;
+            OnAccept?.Invoke(c);
+            _ = c.RunOutputPump();
+        }
+    }
+
+    /// <summary>Reactor thread: ends the recv side of <paramref name="conn"/> exactly once.</summary>
+    private static void Close(EpollConnection conn)
+    {
+        // The connection stays in _conns until its LAST reference is dropped, so later events
+        // for the same fd (or OnReadable returning false again) can lead back here. Only the
+        // first call may release the reactor's reference; a second DecRef would take the
+        // pump's reference and close the fd while the pump may still be sending on it.
+        if (conn.IsClosed) return;
+
+        conn.MarkClosed();  // complete Input.Writer (reactor is its sole writer)
+        conn.DecRef();      // reactor/recv side done
+    }
+
+    /// <summary>Creates a non-blocking IPv4 listener on 0.0.0.0:port with SO_REUSEADDR and SO_REUSEPORT set.</summary>
+    private static int OpenReusePortListener(ushort port, int backlog)
+    {
+        int fd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+        if (fd < 0) throw new Exception($"socket failed errno={Marshal.GetLastPInvokeError()}");
+
+        int one = 1;
+        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, ref one, sizeof(int));
+        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, ref one, sizeof(int));
+
+        int fl = fcntl(fd, F_GETFL, 0);
+        if (fl >= 0) fcntl(fd, F_SETFL, fl | O_NONBLOCK);
+
+        var addr = new sockaddr_in
+        {
+            sin_family = (ushort)AF_INET,
+            sin_port = Htons(port),
+            sin_addr = new in_addr { s_addr = 0 },
+            sin_zero = new byte[8]
+        };
+        if (bind(fd, ref addr, (uint)Marshal.SizeOf<sockaddr_in>()) != 0)
+            throw new Exception($"bind failed errno={Marshal.GetLastPInvokeError()}");
+        if (listen(fd, backlog) != 0)
+            throw new Exception($"listen failed errno={Marshal.GetLastPInvokeError()}");
+        return fd;
+    }
+}
diff --git a/KestrelShrike/KestrelShrike.csproj b/KestrelShrike/KestrelShrike.csproj
new file mode 100644
index 0000000..64b8a91
--- /dev/null
+++ b/KestrelShrike/KestrelShrike.csproj
@@ -0,0 +1,15 @@
+
+
+
+ net10.0
+ enable
+ enable
+ true
+ KestrelShrike
+
+
+
+
+
+
+
diff --git a/KestrelShrike/Native.cs b/KestrelShrike/Native.cs
new file mode 100644
index 0000000..151e93c
--- /dev/null
+++ b/KestrelShrike/Native.cs
@@ -0,0 +1,297 @@
+// ReSharper disable always CheckNamespace
+// ReSharper disable always SuggestVarOrType_BuiltInTypes
+// (var is avoided intentionally in this project so that concrete types are visible at call
sites.)
+// ReSharper disable always StackAllocInsideLoop
+// ReSharper disable always ClassCannotBeInstantiated
+#pragma warning disable CA2014
+
+namespace KestrelShrike;
+
+/// <summary>
+/// Linux interop surface for a high-performance, epoll-driven TCP server.
+///
+/// Design goals:
+/// - **Minimal marshaling overhead**: prefer blittable types (e.g., pointers, ints).
+/// - **Explicit error handling**: the P/Invokes are marked <c>SetLastError = true</c> (gettid excepted).
+///   Use Marshal.GetLastPInvokeError() immediately after a failure to read errno.
+/// - **Unsafe-friendly**: exposes pointer overloads for zero-copy recv/send.
+///
+/// Platform notes:
+/// - Constants can differ across libc/architectures/kernels. The values here target
+///   mainstream Linux/glibc on x86_64. If you target other distros/architectures, verify
+///   these values against system headers (bits/socket.h, fcntl.h, sys/epoll.h, sys/eventfd.h).
+/// - Network byte order: ports must be big-endian (use htons); addresses must be set appropriately.
+/// - SIGPIPE: either ignore SIGPIPE process-wide or pass <see cref="MSG_NOSIGNAL"/> to send.
+/// </summary>
+internal static unsafe class Native
+{
+    // =========================
+    // P/Invoke
+    // =========================
+
+    /// <summary>
+    /// Create a socket. Typically domain=AF_INET, type=SOCK_STREAM, protocol=IPPROTO_TCP.
+    /// Returns a file descriptor (>= 0) on success, or -1 on error (check errno).
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int socket(int domain, int type, int protocol);
+
+    /// <summary>
+    /// Bind a socket to an address/port. Use <see cref="sockaddr_in"/> for IPv4.
+    /// Returns 0 on success, -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int bind(int sockfd, ref sockaddr_in addr, uint addrlen);
+
+    /// <summary>
+    /// Mark a bound socket as passive (accept incoming connections).
+    /// <paramref name="backlog"/> is the kernel queue length hint.
+    /// Returns 0 on success, -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int listen(int sockfd, int backlog);
+
+    /// <summary>
+    /// Accept a new connection. flags can include <see cref="SOCK_NONBLOCK"/> and <see cref="SOCK_CLOEXEC"/>
+    /// to atomically configure the accepted FD. Returns new client FD or -1 on error.
+    /// Use Marshal.GetLastPInvokeError() to check for <see cref="EAGAIN"/>/<see cref="EWOULDBLOCK"/> in edge-triggered loops.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int accept4(int sockfd, IntPtr addr, IntPtr addrlen, int flags);
+
+    /// <summary>
+    /// Set a socket option (int value). Common options: <see cref="SO_REUSEADDR"/>, <see cref="SO_REUSEPORT"/>, TCP_NODELAY, etc.
+    /// Returns 0 on success, -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int setsockopt(int sockfd, int level, int optname, ref int optval, uint optlen);
+
+    /// <summary>
+    /// Set SO_LINGER using the <see cref="Linger"/> struct.
+    /// Returns 0 on success, -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int setsockopt(int sockfd, int level, int optname, ref Linger optval, uint optlen);
+
+    /// <summary>
+    /// File control. Typical usage: get/set O_NONBLOCK on a socket.
+    /// Returns result per command, or -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int fcntl(int fd, int cmd, int arg);
+
+    /// <summary>
+    /// Close a file descriptor (socket or epoll/eventfd). Returns 0 on success, -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int close(int fd);
+
+    /// <summary>
+    /// Read from a file descriptor into unmanaged memory.
+    /// For sockets, prefer <c>recv</c>.
+    /// Returns bytes read (>=0) or -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern long read(int fd, IntPtr buf, ulong count);
+
+    /// <summary>
+    /// Write to a file descriptor from unmanaged memory.
+    /// For sockets, prefer <c>send</c>.
+    /// Returns bytes written (>=0) or -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern long write(int fd, IntPtr buf, ulong count);
+
+    /// <summary>
+    /// Receive from a socket into unmanaged memory. Returns bytes received (>=0), 0 on orderly shutdown, or -1 on error.
+    /// Set flags to 0 for normal reads.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern long recv(int sockfd, IntPtr buf, ulong len, int flags);
+
+    /// <summary>
+    /// Receive from a socket into a raw pointer. Equivalent to the IntPtr overload, but avoids extra pinning overhead when you already have a pointer.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern long recv(int sockfd, byte* buf, ulong len, int flags);
+
+    /// <summary>
+    /// Send to a socket from unmanaged memory. Returns bytes sent (>=0) or -1 on error.
+    /// Consider passing <see cref="MSG_NOSIGNAL"/> in flags to avoid SIGPIPE on closed peers.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern long send(int sockfd, IntPtr buf, ulong len, int flags);
+
+    /// <summary>
+    /// Send to a socket from a raw pointer (long length).
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern long send(int sockfd, byte* buf, long len, int flags);
+
+    /// <summary>
+    /// Send to a socket from a raw void* and nuint length.
+    /// This signature maps closely to the native prototype and can reduce marshaling overhead in hot paths.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] public static extern nint send(int sockfd, void* buf, nuint len, int flags);
+
+    /// <summary>
+    /// Create an epoll instance. Returns an epoll file descriptor (>=0) or -1 on error.
+    /// Use <see cref="EPOLL_CLOEXEC"/> to set close-on-exec at creation time.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int epoll_create1(int flags);
+
+    /// <summary>
+    /// Control the epoll interest list (add/mod/del). The ev points to an epoll_event struct in unmanaged memory.
+    /// Returns 0 on success, -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int epoll_ctl(int epfd, int op, int fd, IntPtr ev);
+
+    /// <summary>
+    /// Wait for events. events points to a contiguous array of epoll_event (maxevents elements).
+    /// Returns number of events (>=0) or -1 on error. Use a negative timeout to block indefinitely.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int epoll_wait(int epfd, IntPtr events, int maxevents, int timeout);
+
+    /// <summary>
+    /// Create an eventfd (userspace semaphore/notification). Great for waking worker threads from another thread.
+    /// Returns fd (>=0) or -1 on error.
+    /// </summary>
+    [DllImport("libc", SetLastError = true)] internal static extern int eventfd(uint initval, int flags);
+
+    [DllImport("libc", SetLastError = true)] internal static extern int sched_setaffinity(int pid, IntPtr cpusetsize, ref ulong mask);
+
+    [DllImport("libc", SetLastError = true)] internal static extern int sched_setaffinity(int pid, IntPtr cpusetsize, ref cpu_set_t mask);
+
+    [DllImport("libc")] internal static extern int gettid(); // Linux thread id
+
+    // =========================
+    // Struct definitions
+    // =========================
+
+    /// <summary>
+    /// IPv4 address (network byte order).
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential)]
+    internal struct in_addr
+    {
+        /// <summary>
+        /// Address in network byte order (big-endian). 0 == INADDR_ANY.
+        /// </summary>
+        public uint s_addr;
+    }
+
+    /// <summary>
+    /// IPv4 socket address. Must be passed with addrlen = (uint)sizeof(sockaddr_in).
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential)]
+    internal struct sockaddr_in
+    {
+        /// <summary>Address family (AF_INET).</summary>
+        public ushort sin_family;
+
+        /// <summary>Port in network byte order (use htons).</summary>
+        public ushort sin_port;
+
+        /// <summary>IPv4 address (use INADDR_ANY or a specific address in network byte order).</summary>
+        public in_addr sin_addr;
+
+        /// <summary>
+        /// Padding to match native layout (8 bytes). Must be present for correct size.
+        /// It need not be initialized for normal usage; the kernel ignores it.
+        /// </summary>
+        [MarshalAs(UnmanagedType.ByValArray, SizeConst = 8)]
+        public byte[] sin_zero;
+    }
+
+    /// <summary>
+    /// linger option for SO_LINGER.
+    /// If l_onoff != 0, close() will block up to l_linger seconds to flush pending data.
+    /// Be careful: enabling linger can cause unexpected blocking on close.
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential)]
+    internal struct Linger
+    {
+        public int l_onoff;
+        public int l_linger;
+    }
+
+
+    // =========================
+    // Constants
+    // =========================
+    // Socket families/types/protocols
+    internal const int AF_INET = 2;
+    internal const int SOCK_STREAM = 1;
+    internal const int IPPROTO_TCP = 6;
+
+    // setsockopt levels / names
+    internal const int SOL_SOCKET = 1;
+    internal const int SO_REUSEADDR = 2;
+    internal const int SO_REUSEPORT = 15;
+    internal const int SO_LINGER = 13;
+    /// <summary>
+    /// TCP_NODELAY (disable Nagle). Linux defines this at level IPPROTO_TCP with optname=1.
+    /// (Kept here as constant=1; use level=IPPROTO_TCP when calling setsockopt.)
+    /// </summary>
+    internal const int TCP_NODELAY = 1;
+
+    // fcntl / file status flags
+    internal const int O_NONBLOCK = 0x800; // Verify per-arch.
+    internal const int F_GETFL = 3;
+    internal const int F_SETFL = 4;
+
+    // epoll events
+    internal const int EPOLLIN = 0x001;
+    internal const int EPOLLOUT = 0x004;
+    internal const int EPOLLERR = 0x008;
+    internal const int EPOLLHUP = 0x010;
+    internal const int EPOLLRDHUP = 0x2000;
+    internal const uint EPOLLET = 0x80000000;
+    internal const uint EPOLLONESHOT = 0x40000000;
+
+    // epoll_ctl ops
+    internal const int EPOLL_CTL_ADD = 1;
+    internal const int EPOLL_CTL_DEL = 2;
+    internal const int EPOLL_CTL_MOD = 3;
+
+    // CLOEXEC / NONBLOCK flags (creation-time)
+    /// <summary>Close-on-exec for epoll_create1. (Verify on your target kernel/arch.)</summary>
+    internal const int EPOLL_CLOEXEC = 0x80000;
+
+    /// <summary>
+    /// Close-on-exec for socket()/accept4(). Same bits as O_CLOEXEC: 0x80000 (octal 02000000) on x86/x86_64/ARM Linux.
+    /// A few other architectures define a different value; validate it if you target one of them.
+    /// </summary>
+    internal const int SOCK_CLOEXEC = 0x80000;
+
+    /// <summary>Creation-time nonblocking for socket/accept4.</summary>
+    internal const int SOCK_NONBLOCK = 0x800;
+
+    // eventfd flags
+    internal const int EFD_NONBLOCK = 0x800;
+    internal const int EFD_CLOEXEC = 0x80000;
+
+    // send/recv flags
+    /// <summary>
+    /// Suppress SIGPIPE on send. Alternatively, ignore SIGPIPE process-wide.
+    /// </summary>
+    internal const int MSG_NOSIGNAL = 0x4000;
+
+    // Common errno values we branch on in tight loops
+    internal const int EINTR = 4;
+    internal const int EAGAIN = 11;
+    internal const int EWOULDBLOCK = 11;
+    internal const int EPIPE = 32;
+    internal const int ECONNABORTED = 103;
+    internal const int ECONNRESET = 104;
+
+    public static void PinCurrentThreadToCpu(int cpuIndex)
+    {
+        if (cpuIndex < 0 || cpuIndex >= Environment.ProcessorCount || cpuIndex >= 16 * 64) // cpu_set_t below holds 16 x 64 bits
+            throw new ArgumentOutOfRangeException(nameof(cpuIndex));
+
+        unsafe
+        {
+            var set = new cpu_set_t();
+            int word = cpuIndex / 64;
+            int bit = cpuIndex % 64;
+            set.Bits[word] = 1UL << bit;
+
+            int tid = gettid();
+            int ret = sched_setaffinity(tid, (IntPtr)sizeof(cpu_set_t), ref set);
+            if (ret != 0)
+                throw new InvalidOperationException($"sched_setaffinity failed with errno {Marshal.GetLastPInvokeError()}");
+        }
+    }
+}
+
+internal unsafe struct cpu_set_t
+{
+    public fixed ulong Bits[16]; // 1024 bits (enough for up to 1024 CPUs)
+}
\ No newline at end of file
diff --git a/KestrelShrike/ProcessorArchDependant.cs b/KestrelShrike/ProcessorArchDependant.cs
new file mode 100644
index 0000000..607e153
--- /dev/null
+++ b/KestrelShrike/ProcessorArchDependant.cs
@@ -0,0 +1,149 @@
+// ReSharper disable always CheckNamespace
+// ReSharper disable always SuggestVarOrType_BuiltInTypes
+// (var is avoided intentionally in this project so that concrete types are visible at call sites.)
+// ReSharper disable always StackAllocInsideLoop
+// ReSharper disable always ClassCannotBeInstantiated
+#pragma warning disable CA2014
+
+namespace KestrelShrike;
+
+///
+/// Provides architecture-dependent helpers for low-level socket and epoll interop.
+///
+/// <remarks><para>
+/// Linux’s struct epoll_event has different binary layouts depending on CPU architecture
+/// (notably 12 bytes on x86/x64 and 16 bytes on most other architectures like ARM/ARM64).
+/// This class exposes constants and helpers to correctly read and write those structures
+/// at runtime based on the process architecture.
+/// </para>
+///
+/// <para>
+/// These methods are used to serialize and deserialize epoll_event structures directly
+/// into unmanaged buffers when interfacing with epoll_wait, epoll_ctl, and related syscalls.
+/// </para>
+/// </remarks>
+internal static unsafe class ProcessorArchDependant
+{
+    // =============================================================================================
+    // Architecture-dependent configuration
+    // =============================================================================================
+
+    /// <summary>
+    /// Indicates whether the current platform uses a packed epoll_event layout (12 bytes).
+    /// <para>
+    /// On x86 and x64 (little-endian), the epoll_event structure is packed to 12 bytes.
+    /// On ARM, ARM64, and others, it uses natural 8-byte alignment, resulting in 16 bytes.
+    /// </para>
+    /// </summary>
+    internal static readonly bool Packed =
+        RuntimeInformation.ProcessArchitecture == Architecture.X64 ||
+        RuntimeInformation.ProcessArchitecture == Architecture.X86;
+
+    /// <summary>
+    /// The size (in bytes) of an epoll_event structure for the current runtime architecture.
+    /// <para>
+    /// Typically 12 bytes for packed x86/x64 layouts and 16 for natural alignment layouts.
+    /// </para>
+    /// </summary>
+    internal static readonly int EvSize = Packed ? 12 : 16;
+
+    // =============================================================================================
+    // Struct read/write helpers
+    // =============================================================================================
+
+    /// <summary>
+    /// Writes a Linux epoll_event structure into a preallocated unmanaged memory region.
+    /// </summary>
+    /// <param name="dest">Destination pointer to write the structure into.</param>
+    /// <param name="events">Bitmask of epoll events (e.g. EPOLLIN, EPOLLOUT, EPOLLRDHUP, etc.).</param>
+    /// <param name="fd">The file descriptor associated with the event.</param>
+    /// <remarks>
+    /// <para>
+    /// Layouts by architecture:
+    /// <list type="bullet">
+    /// <item>Packed (x86/x64): events @ 0 (4 bytes), data @ 4 (8 bytes)</item>
+    /// <item>Natural (ARM/others): events @ 0 (4 bytes), padding 4, data @ 8 (8 bytes)</item>
+    /// </list>
+    /// </para>
+    /// Only the lower 32 bits of <paramref name="fd"/> are stored in the data field.
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static void WriteEpollEvent(void* dest, uint events, int fd)
+    {
+        if (Packed)
+        {
+            // events @0 (4 bytes), data @4 (8 bytes)
+            *(uint*)dest = events;
+            *(ulong*)((byte*)dest + 4) = (uint)fd; // store fd in low 32 bits
+        }
+        else
+        {
+            // events @0 (4 bytes), pad 4, data @8 (8 bytes)
+            *(uint*)dest = events;
+            *(ulong*)((byte*)dest + 8) = (uint)fd;
+        }
+    }
+
+    /// <summary>
+    /// Reads a Linux epoll_event structure from unmanaged memory and extracts its fields.
+    /// </summary>
+    /// <param name="src">Pointer to the source buffer containing the epoll_event structure.</param>
+    /// <param name="events">Outputs the event flags (EPOLLIN, EPOLLOUT, etc.).</param>
+    /// <param name="fd">Outputs the associated file descriptor.</param>
+    /// <remarks>
+    /// Reads using the correct layout depending on the <see cref="Packed"/> flag.
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static void ReadEpollEvent(void* src, out uint events, out int fd)
+    {
+        if (Packed)
+        {
+            events = *(uint*)src;
+            fd = (int)*(uint*)((byte*)src + 4);
+        }
+        else
+        {
+            events = *(uint*)src;
+            fd = (int)*(uint*)((byte*)src + 8);
+        }
+    }
+
+    // Variations, TODO: Test performance required
+    [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+    internal static void WriteEpollEvent2(void* dest, uint events, int fd)
+    {
+        // Write events (always aligned 4B store)
+        *(uint*)dest = events;
+        // Compute data offset (packed: +4, natural: +8)
+        var data = (byte*)dest + (Packed ? 4 : 8);
+        // Store only low 32 bits of fd and zero the high 32 bits.
+        // Using two 4B stores avoids an unaligned 8B write in the packed layout.
+        *(uint*)data = (uint)fd; // low 32
+        *(uint*)(data + 4) = 0; // high 32
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+    internal static void ReadEpollEvent2(void* src, out uint events, out int fd)
+    {
+        events = *(uint*)src;
+        var data = (byte*)src + (Packed ? 4 : 8);
+        // We only ever wrote the low 32 bits; read exactly those.
+        fd = (int)*(uint*)data;
+    }
+
+    // =============================================================================================
+    // Networking helpers
+    // =============================================================================================
+
+    /// <summary>
+    /// Converts a 16-bit unsigned integer from host byte order to network byte order (big-endian).
+    /// </summary>
+    /// <param name="x">The value to convert.</param>
+    /// <returns>The converted value in network byte order.</returns>
+    /// <remarks>
+    /// Equivalent to the native htons() function from the BSD sockets API.
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static ushort Htons(ushort x) =>
+        BitConverter.IsLittleEndian ? BinaryPrimitives.ReverseEndianness(x) : x;
+}
\ No newline at end of file
diff --git a/KestrelShrike/ShrikeKestrel.cs b/KestrelShrike/ShrikeKestrel.cs
new file mode 100644
index 0000000..c5818cb
--- /dev/null
+++ b/KestrelShrike/ShrikeKestrel.cs
@@ -0,0 +1,178 @@
+using System.Net;
+using System.Threading.Channels;
+using Microsoft.AspNetCore.Connections;
+using Microsoft.AspNetCore.Connections.Features;
+using Microsoft.AspNetCore.Hosting;
+using Microsoft.AspNetCore.Http.Features;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+
+namespace KestrelShrike;
+
+/// <summary>Pairs the reader of one pipe with the writer of another as a single transport.</summary>
+internal sealed class DuplexPipe : IDuplexPipe
+{
+    public DuplexPipe(PipeReader input, PipeWriter output) { Input = input; Output = output; }
+    public PipeReader Input { get; }
+    public PipeWriter Output { get; }
+}
+
+/// <summary>
+/// Kestrel's view of one <see cref="EpollConnection"/>: exposes Input.Reader / Output.Writer as
+/// the transport and implements the connection features itself.
+/// </summary>
+internal sealed class ShrikeConnectionContext : ConnectionContext,
+    IConnectionIdFeature, IConnectionTransportFeature, IConnectionItemsFeature,
+    IConnectionLifetimeFeature, IConnectionEndPointFeature
+{
+    private static long s_id;
+
+    private readonly EpollConnection _conn;
+    private readonly IDuplexPipe _transport;
+    private readonly CancellationTokenSource _closedCts = new();
+    private readonly FeatureCollection _features = new();
+    private bool _disposed;
+
+    public ShrikeConnectionContext(EpollConnection conn, EndPoint? localEndPoint)
+    {
+        _conn = conn;
+        _transport = new DuplexPipe(conn.Input.Reader, conn.Output.Writer);
+
+        ConnectionId = $"shrike-{Interlocked.Increment(ref s_id):x}";
+        LocalEndPoint = localEndPoint;
+        Items = new ConnectionItems();
+        ConnectionClosed = _closedCts.Token;
+
+        _features.Set<IConnectionIdFeature>(this);
+        _features.Set<IConnectionTransportFeature>(this);
+        _features.Set<IConnectionItemsFeature>(this);
+        _features.Set<IConnectionLifetimeFeature>(this);
+        _features.Set<IConnectionEndPointFeature>(this);
+    }
+
+    public override string ConnectionId { get; set; }
+    public override IFeatureCollection Features => _features;
+    public override IDictionary<object, object?> Items { get; set; }
+    public override IDuplexPipe Transport
+    {
+        get => _transport;
+        set => throw new NotSupportedException("Transport is owned by the Shrike transport.");
+    }
+    public override CancellationToken ConnectionClosed { get; set; }
+    public override EndPoint? LocalEndPoint { get; set; }
+    public override EndPoint? RemoteEndPoint { get; set; }
+
+    /// <summary>Signals ConnectionClosed and stops the output pump without a graceful drain.</summary>
+    public override void Abort(ConnectionAbortedException abortReason)
+    {
+        try { _closedCts.Cancel(); } catch { }
+        try { _conn.Output.Reader.CancelPendingRead(); } catch { }      // wake the pump
+        try { _conn.Output.Writer.Complete(abortReason); } catch { }   // pump exits
+    }
+
+    /// <summary>Completes both transport ends; repeated calls are no-ops.</summary>
+    public override ValueTask DisposeAsync()
+    {
+        if (_disposed) return ValueTask.CompletedTask;
+        _disposed = true;
+        try { _closedCts.Cancel(); } catch { }
+        try { _conn.Input.Reader.Complete(); } catch { }   // Kestrel done reading
+        try { _conn.Output.Writer.Complete(); } catch { }  // pump exits
+        _closedCts.Dispose();
+        return ValueTask.CompletedTask;
+    }
+}
+
+/// <summary>Hands connections accepted by the <see cref="EpollEngine"/> to Kestrel.</summary>
+internal sealed class ShrikeConnectionListener : IConnectionListener
+{
+    private readonly EpollEngine _engine;
+
+    public ShrikeConnectionListener(EpollEngine engine, EndPoint endpoint)
+    {
+        _engine = engine;
+        EndPoint = endpoint;
+    }
+
+    public EndPoint EndPoint { get; }
+
+    /// <returns>The next connection, or null once the token is cancelled or the engine has stopped.</returns>
+    public async ValueTask<ConnectionContext?> AcceptAsync(CancellationToken cancellationToken = default)
+    {
+        try
+        {
+            EpollConnection conn = await _engine.AcceptAsync(cancellationToken).ConfigureAwait(false);
+            return new ShrikeConnectionContext(conn, EndPoint);
+        }
+        catch (OperationCanceledException) { return null; }
+        catch (ChannelClosedException) { return null; }
+    }
+
+    public ValueTask UnbindAsync(CancellationToken cancellationToken = default) { _engine.Stop(); return ValueTask.CompletedTask; }
+    public ValueTask DisposeAsync() { _engine.Stop(); return ValueTask.CompletedTask; }
+}
+
+/// <summary>Tuning knobs for the Shrike transport.</summary>
+public sealed class ShrikeTransportOptions
+{
+    /// <summary>Number of reactor threads; each gets its own epoll instance and SO_REUSEPORT listener.</summary>
+    public int ReactorCount { get; set; } = Math.Max(1, Environment.ProcessorCount);
+
+    /// <summary>Backlog passed to listen() for every listener.</summary>
+    public int Backlog { get; set; } = 16384;
+
+    /// <summary>Capacity of the event buffer passed to each epoll_wait call.</summary>
+    public int MaxEventsPerWake { get; set; } = 512;
+}
+
+/// <summary>Connection listener factory that binds endpoints with epoll reactors.</summary>
+public sealed class ShrikeTransportFactory : IConnectionListenerFactory
+{
+    private readonly ShrikeTransportOptions _options;
+    private readonly ILogger _logger;
+
+    public ShrikeTransportFactory(IOptions<ShrikeTransportOptions> options, ILoggerFactory loggerFactory)
+    {
+        _options = options.Value;
+        _logger = loggerFactory.CreateLogger<ShrikeTransportFactory>();
+    }
+
+    /// <summary>Starts an engine for <paramref name="endpoint"/> and returns its listener.</summary>
+    /// <exception cref="NotSupportedException">The endpoint is not an <see cref="IPEndPoint"/>.</exception>
+    public ValueTask<IConnectionListener> BindAsync(EndPoint endpoint, CancellationToken cancellationToken = default)
+    {
+        if (endpoint is not IPEndPoint ip)
+            throw new NotSupportedException($"Shrike only supports {nameof(IPEndPoint)} (got {endpoint.GetType().Name}).");
+
+        // Only the port reaches the reactors: every listener binds IPv4 0.0.0.0. Say so when the
+        // caller asked for something narrower (e.g. loopback) instead of silently widening it.
+        if (!IPAddress.Any.Equals(ip.Address) && !IPAddress.IPv6Any.Equals(ip.Address))
+            _logger.LogWarning("[shrike-k] Requested address {Address} is ignored; listening on 0.0.0.0:{Port}", ip.Address, ip.Port);
+
+        // Each reactor opens its own listener, so port 0 would give every reactor a different port.
+        if (ip.Port == 0)
+            _logger.LogWarning("[shrike-k] Port 0 (dynamic port) is not supported: each reactor would bind its own ephemeral port");
+
+        var engine = new EpollEngine((ushort)ip.Port, _options.ReactorCount, _options.Backlog, _options.MaxEventsPerWake);
+        engine.Start();
+        _logger.LogInformation("[shrike-k] Bound :{Port} with {ReactorCount} epoll reactor(s) (SO_REUSEPORT)", ip.Port, _options.ReactorCount);
+
+        IConnectionListener listener = new ShrikeConnectionListener(engine, ip);
+        return ValueTask.FromResult(listener);
+    }
+}
+
+public static class ShrikeKestrelExtensions
+{
+    /// <summary>Replace Kestrel's socket transport with the epoll-based Shrike transport.</summary>
+    public static IWebHostBuilder UseShrike(this IWebHostBuilder builder, Action<ShrikeTransportOptions>? configure = null)
+    {
+        builder.ConfigureServices(services =>
+        {
+            if (configure is not null) services.Configure(configure);
+            services.AddSingleton<IConnectionListenerFactory, ShrikeTransportFactory>();
+        });
+        return builder;
+    }
+}
diff --git a/KestrelShrike/_usings.cs b/KestrelShrike/_usings.cs
new file mode 100644
index 0000000..7ad53a5
--- /dev/null
+++ b/KestrelShrike/_usings.cs
@@ -0,0 +1,9 @@
+global using System;
+global using System.Buffers;
+global using System.Buffers.Binary;
+global using System.Collections.Concurrent;
+global using System.IO.Pipelines;
+global using System.Runtime.CompilerServices;
+global using System.Runtime.InteropServices;
+global using static KestrelShrike.ProcessorArchDependant;
+global using static KestrelShrike.Native;
diff --git a/KestrelZerg.Demo/Program.cs b/KestrelZerg.Demo/Program.cs
index f81f538..8d11ae8 100644
--- a/KestrelZerg.Demo/Program.cs
+++ b/KestrelZerg.Demo/Program.cs
@@ -8,7 +8,7 @@
 })
 .UseZerg(opts =>
 {
- opts.ReactorCount = Math.Max(1, 12);
+ opts.ReactorCount = Math.Max(1, 32);
 });
 
 var app = builder.Build();
diff --git a/Kite.Demo/Kite.Demo.csproj b/Kite.Demo/Kite.Demo.csproj
new file mode 100644
index 0000000..898e902
--- /dev/null
+++ b/Kite.Demo/Kite.Demo.csproj
@@ -0,0 +1,16 @@
+
+
+
+ net10.0
+ enable
+ enable
+ true
+ false
+ true
+
+
+
+
+
+
+
diff --git a/Kite.Demo/Program.cs b/Kite.Demo/Program.cs
new file mode 100644
index 0000000..0e8fade
--- /dev/null
+++ b/Kite.Demo/Program.cs
@@ -0,0 +1,10 @@
+using Kite;
+
+var builder = WebApplication.CreateBuilder(args);
+builder.Logging.SetMinimumLevel(LogLevel.Warning);
+builder.WebHost.UseKite(o => o.ReactorCount = 8);
+builder.WebHost.ConfigureKestrel(o => o.ListenAnyIP(8080));
+
+var app = builder.Build();
+app.MapGet("/", () => "Hello from Kite + Kestrel");
+app.Run();
diff --git a/Kite/Kite.csproj b/Kite/Kite.csproj
new file mode 100644
index 0000000..4491f6f
--- /dev/null
+++ b/Kite/Kite.csproj
@@ -0,0 +1,18 @@
+
+
+
+ net10.0
+ enable
+ disable
+ true
+
+
+
+
+
+
+
+
+
+
+
diff --git
a/Kite/KiteConnection.cs b/Kite/KiteConnection.cs new file mode 100644 index 0000000..f60ce73 --- /dev/null +++ b/Kite/KiteConnection.cs @@ -0,0 +1,128 @@ +namespace Kite; + +/// +/// Lean connection — KestrelShrike's design (dual BCL Pipes + a libc-send pump), +/// but recv is driven by an io_uring reactor instead of epoll. NO Minima IVTS, +/// NO write slab, NO SPSC. The reactor copies recv bytes into Input; Kestrel reads +/// Input and writes Output; the pump drains Output and sends via libc send(). +/// +internal sealed class KiteConnection +{ + public readonly int Fd; + public readonly long Id; + private readonly KiteReactor _reactor; + + public readonly Pipe Input; + public readonly Pipe Output; + + private int _refs = 2; // reactor (recv) side + pump side + private int _closed; + private static long s_id; + + public KiteConnection(int fd, KiteReactor reactor) + { + Fd = fd; + _reactor = reactor; + Id = Interlocked.Increment(ref s_id); + var o = new PipeOptions(pauseWriterThreshold: 0, resumeWriterThreshold: 0, useSynchronizationContext: false); + Input = new Pipe(o); + Output = new Pipe(o); + } + + public bool IsClosed => Volatile.Read(ref _closed) != 0; + + /// Reactor thread: copy recv bytes (from the provided buffer) into Input. + public unsafe void OnRecv(byte* ptr, int len) + { + Span dst = Input.Writer.GetSpan(len); + new ReadOnlySpan(ptr, len).CopyTo(dst); + Input.Writer.Advance(len); + _ = Input.Writer.FlushAsync(); // schedules Kestrel's read on the thread pool + } + + /// Output pump (thread pool): drain Output and send via libc send(), like KestrelShrike. 
+ public async Task RunOutputPump() + { + PipeReader reader = Output.Reader; + try + { + while (true) + { + ReadResult r = await reader.ReadAsync().ConfigureAwait(false); + if (r.IsCanceled) break; + + ReadOnlySequence buf = r.Buffer; + bool fail = false; + + foreach (ReadOnlyMemory seg in buf) + { + int off = 0; + while (off < seg.Length) + { + int sent = TrySend(seg.Span.Slice(off), out bool wouldBlock, out bool closed); + if (closed) { fail = true; break; } + if (sent > 0) { off += sent; continue; } + if (wouldBlock) + { + // EAGAIN is ~never hit for small responses. Yield+retry instead of + // an io_uring POLLOUT round-trip (fine for the benchmark experiment). + if (IsClosed) { fail = true; break; } + await Task.Yield(); + } + } + if (fail) break; + } + + reader.AdvanceTo(buf.End); + if (fail || r.IsCompleted) break; + } + } + catch { } + finally { try { reader.Complete(); } catch { } DecRef(); } + } + + private unsafe int TrySend(ReadOnlySpan data, out bool wouldBlock, out bool closed) + { + wouldBlock = false; + closed = false; + if (data.IsEmpty) return 0; + + long n; + fixed (byte* p = data) n = send(Fd, p, (nuint)data.Length, MSG_NOSIGNAL); + if (n > 0) return (int)n; + + int err = (n == 0) ? EAGAIN : Marshal.GetLastPInvokeError(); + if (err == EAGAIN || err == EWOULDBLOCK) { wouldBlock = true; return 0; } + if (err == 4 /* EINTR */) return 0; + closed = true; + return 0; + } + + public void MarkClosed() + { + if (Interlocked.Exchange(ref _closed, 1) == 1) return; + try { Input.Writer.Complete(); } catch { } + } + + public void DecRef() + { + if (Interlocked.Decrement(ref _refs) != 0) return; + _reactor.Remove(this); + close(Fd); + } + + /// + /// Kestrel finished/aborted the connection. Complete the pipes (pump exits) and + /// shutdown() the socket so the reactor's outstanding multishot recv completes + /// (res <= 0) and releases its ref. The fd is closed once both refs drop to 0. 
+ /// + public void OnKestrelClose() + { + MarkClosed(); + try { Output.Writer.Complete(); } catch { } + shutdown(Fd, SHUT_RDWR); + } + + [DllImport("libc")] private static extern int shutdown(int fd, int how); + private const int SHUT_RDWR = 2; +} diff --git a/Kite/KiteEngine.cs b/Kite/KiteEngine.cs new file mode 100644 index 0000000..84c2a8e --- /dev/null +++ b/Kite/KiteEngine.cs @@ -0,0 +1,48 @@ +namespace Kite; + +/// N io_uring reactors (SO_REUSEPORT) feeding accepted connections to Kestrel via a channel. +public sealed class KiteEngine +{ + private readonly List _reactors = new(); + private readonly List _threads = new(); + private readonly Channel _accepted = + Channel.CreateUnbounded(new UnboundedChannelOptions { SingleReader = false, SingleWriter = false }); + + private readonly ushort _port; + private readonly int _reactorCount; + private readonly int _recvBufferSize; + private readonly int _bufferRingEntries; + private readonly uint _ringEntries; + + internal KiteEngine(ushort port, int reactorCount, + uint ringEntries = 8192, int recvBufferSize = 16 * 1024, int bufferRingEntries = 4096) + { + _port = port; + _reactorCount = reactorCount; + _ringEntries = ringEntries; + _recvBufferSize = recvBufferSize; + _bufferRingEntries = bufferRingEntries; + } + + internal void Start() + { + for (int i = 0; i < _reactorCount; i++) + { + var r = new KiteReactor(i, _port, _ringEntries, _recvBufferSize, _bufferRingEntries) { OnAccept = OnReactorAccept }; + _reactors.Add(r); + var t = new Thread(r.Run) { IsBackground = true, Name = $"kite-r{i}" }; + _threads.Add(t); + t.Start(); + } + } + + private void OnReactorAccept(KiteConnection conn) => _accepted.Writer.TryWrite(conn); + + internal ValueTask AcceptAsync(CancellationToken ct) => _accepted.Reader.ReadAsync(ct); + + internal void Stop() + { + foreach (var r in _reactors) r.Stop(); + _accepted.Writer.TryComplete(); + } +} diff --git a/Kite/KiteKestrel.cs b/Kite/KiteKestrel.cs new file mode 100644 index 
0000000..543f60a --- /dev/null +++ b/Kite/KiteKestrel.cs @@ -0,0 +1,142 @@ +using System.Net; +using Microsoft.AspNetCore.Connections; +using Microsoft.AspNetCore.Connections.Features; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Http.Features; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace Kite; + +internal sealed class KiteDuplexPipe : IDuplexPipe +{ + public KiteDuplexPipe(KiteConnection conn) + { + Input = conn.Input.Reader; + Output = conn.Output.Writer; + } + public PipeReader Input { get; } + public PipeWriter Output { get; } +} + +internal sealed class KiteConnectionContext : ConnectionContext, + IConnectionIdFeature, IConnectionTransportFeature, IConnectionItemsFeature, + IConnectionLifetimeFeature, IConnectionEndPointFeature +{ + private readonly KiteConnection _conn; + private readonly KiteDuplexPipe _pipe; + private readonly CancellationTokenSource _closedCts = new(); + private readonly FeatureCollection _features = new(); + private bool _disposed; + + public KiteConnectionContext(KiteConnection conn, EndPoint? localEndPoint) + { + _conn = conn; + _pipe = new KiteDuplexPipe(conn); + + ConnectionId = $"kite-{conn.Id:x}"; + LocalEndPoint = localEndPoint; + Items = new ConnectionItems(); + ConnectionClosed = _closedCts.Token; + + _features.Set(this); + _features.Set(this); + _features.Set(this); + _features.Set(this); + _features.Set(this); + } + + public override string ConnectionId { get; set; } + public override IFeatureCollection Features => _features; + public override IDictionary Items { get; set; } + public override IDuplexPipe Transport + { + get => _pipe; + set => throw new NotSupportedException("Transport is owned by the Kite transport."); + } + public override CancellationToken ConnectionClosed { get; set; } + public override EndPoint? LocalEndPoint { get; set; } + public override EndPoint? 
RemoteEndPoint { get; set; } + + public override void Abort(ConnectionAbortedException abortReason) + { + try { _closedCts.Cancel(); } catch { } + _conn.OnKestrelClose(); + } + + public override ValueTask DisposeAsync() + { + if (_disposed) return ValueTask.CompletedTask; + _disposed = true; + try { _closedCts.Cancel(); } catch { } + _conn.OnKestrelClose(); + _closedCts.Dispose(); + return ValueTask.CompletedTask; + } +} + +internal sealed class KiteConnectionListener : IConnectionListener +{ + private readonly KiteEngine _engine; + public KiteConnectionListener(KiteEngine engine, EndPoint endpoint) { _engine = engine; EndPoint = endpoint; } + public EndPoint EndPoint { get; } + + public async ValueTask AcceptAsync(CancellationToken cancellationToken = default) + { + try + { + KiteConnection conn = await _engine.AcceptAsync(cancellationToken).ConfigureAwait(false); + return new KiteConnectionContext(conn, EndPoint); + } + catch (OperationCanceledException) { return null; } + catch (ChannelClosedException) { return null; } + } + + public ValueTask UnbindAsync(CancellationToken cancellationToken = default) { _engine.Stop(); return ValueTask.CompletedTask; } + public ValueTask DisposeAsync() { _engine.Stop(); return ValueTask.CompletedTask; } +} + +public sealed class KiteTransportOptions +{ + public int ReactorCount { get; set; } = Math.Max(1, Environment.ProcessorCount); +} + +public sealed class KiteTransportFactory : IConnectionListenerFactory +{ + private readonly KiteTransportOptions _options; + private readonly ILogger _logger; + + public KiteTransportFactory(IOptions options, ILoggerFactory loggerFactory) + { + _options = options.Value; + _logger = loggerFactory.CreateLogger(); + } + + public ValueTask BindAsync(EndPoint endpoint, CancellationToken cancellationToken = default) + { + if (endpoint is not IPEndPoint ip) + throw new NotSupportedException($"Kite only supports {nameof(IPEndPoint)} (got {endpoint.GetType().Name})."); + + var engine = new 
KiteEngine((ushort)ip.Port, _options.ReactorCount); + engine.Start(); + _logger.LogInformation("[kite] Bound :{Port} with {ReactorCount} io_uring reactor(s) (lean, libc send)", ip.Port, _options.ReactorCount); + + IConnectionListener listener = new KiteConnectionListener(engine, ip); + return ValueTask.FromResult(listener); + } +} + +public static class KiteKestrelExtensions +{ + /// Lean io_uring recv loop + libc send pump — KestrelShrike's design with io_uring instead of epoll. + public static IWebHostBuilder UseKite(this IWebHostBuilder builder, Action? configure = null) + { + builder.ConfigureServices(services => + { + if (configure is not null) services.Configure(configure); + services.AddSingleton(); + }); + return builder; + } +} diff --git a/Kite/KiteReactor.cs b/Kite/KiteReactor.cs new file mode 100644 index 0000000..2f09a73 --- /dev/null +++ b/Kite/KiteReactor.cs @@ -0,0 +1,253 @@ +namespace Kite; + +/// +/// Lean io_uring reactor: one thread + one ring + one SO_REUSEPORT listener + +/// a shared provided-buffer ring. Multishot accept + multishot recv; recv bytes +/// are copied into the connection's Input pipe. The ring does NOT do the send — +/// the response goes out via libc send() in the connection's pump, exactly like +/// KestrelShrike. This isolates "io_uring recv loop" vs "epoll recv loop" with +/// everything else (lean connection, dual pipes, libc send) held identical. +/// +internal sealed unsafe class KiteReactor +{ + public readonly int Id; + private readonly ushort _port; + private readonly uint _ringEntries; + private readonly uint _recvBufferSize; + private readonly uint _bufferRingEntries; + + private Ring _ring = null!; + private int _listenFd; + private readonly ConcurrentDictionary _conns = new(); + private volatile bool _running = true; + + internal Action? OnAccept; + + // CQE user_data: kind tag in the high 32 bits, fd in the low 32. 
+ private const ulong KindAccept = 1UL << 32; + private const ulong KindRecv = 2UL << 32; + private const ushort BgId = 1; + + // Shared provided-buffer ring (one per reactor). + private byte* _bufRing; + private byte* _bufSlab; + private uint _bufRingMask; + private ushort _bufRingTail; + + private const int EINTR = 4; + private const int EAGAIN = 11; + private const int EBUSY = 16; + + public KiteReactor(int id, ushort port, uint ringEntries, int recvBufferSize, int bufferRingEntries) + { + Id = id; + _port = port; + _ringEntries = ringEntries; + _recvBufferSize = (uint)recvBufferSize; + _bufferRingEntries = (uint)bufferRingEntries; + } + + public void Stop() => _running = false; + + internal void Remove(KiteConnection conn) + => _conns.TryRemove(new KeyValuePair(conn.Fd, conn)); + + public void Run() + { + _ring = Ring.Create(_ringEntries); + _listenFd = OpenReusePortListener(_port); + InitBufferRing(); + + Console.WriteLine($"[kite r{Id}] listening on 0.0.0.0:{_port}"); + SubmitAcceptMultishot(); + + while (_running) + { + int rc = _ring.SubmitAndWait(1); + if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) + { + Console.Error.WriteLine($"[kite r{Id}] io_uring_enter failed: {rc}"); + break; + } + + uint ready = _ring.CqReady(); + for (uint i = 0; i < ready; i++) + { + Dispatch(in _ring.CqeAt(i)); + } + _ring.CqAdvance(ready); + } + + close(_listenFd); + _ring.Dispose(); + } + + private void Dispatch(in IoUringCqe cqe) + { + ulong kind = cqe.user_data & 0xffffffff_00000000UL; + int fd = (int)(cqe.user_data & 0xffffffffUL); + bool more = (cqe.flags & IORING_CQE_F_MORE) != 0; + + if (kind == KindAccept) + { + if (cqe.res >= 0) + { + int clientFd = cqe.res; + SetNoDelay(clientFd); + var conn = new KiteConnection(clientFd, this); + _conns[clientFd] = conn; + SubmitRecvMultishot(clientFd); + OnAccept?.Invoke(conn); // vend to the Kestrel transport + _ = conn.RunOutputPump(); // start the libc-send pump (thread pool) + } + else + { + 
Console.Error.WriteLine($"[kite r{Id}] accept error: {cqe.res}"); + } + if (!more) SubmitAcceptMultishot(); + return; + } + + if (kind == KindRecv) + { + bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0; + ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0; + + if (cqe.res <= 0) + { + // Peer EOF / recv error / cancel (e.g. shutdown from Kestrel close). + if (hasBuf) ReturnBufferDirect(bid); + if (_conns.TryRemove(fd, out var dying)) + { + dying.MarkClosed(); + dying.DecRef(); // release the reactor's recv-side ref + } + return; + } + + if (hasBuf && _conns.TryGetValue(fd, out var conn)) + { + conn.OnRecv(_bufSlab + (nuint)bid * (nuint)_recvBufferSize, cqe.res); + ReturnBufferDirect(bid); + } + else if (hasBuf) + { + ReturnBufferDirect(bid); + } + + if (!more) SubmitRecvMultishot(fd); + } + } + + // ========================================================================= + // Provided-buffer ring (copied from Spring's reactor) + // ========================================================================= + + private void InitBufferRing() + { + nuint ringBytes = (nuint)_bufferRingEntries * 16; + _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096); + NativeMemory.Clear(_bufRing, ringBytes); + + nuint slabBytes = _bufferRingEntries * (nuint)_recvBufferSize; + _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64); + + _bufRingMask = _bufferRingEntries - 1; + + var reg = new io_uring_buf_reg + { + ring_addr = (ulong)_bufRing, + ring_entries = _bufferRingEntries, + bgid = BgId, + }; + + int ret = io_uring_register(_ring.Fd, IORING_REGISTER_PBUF_RING, ®, 1); + if (ret < 0) + { + int err = Marshal.GetLastPInvokeError(); + throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}"); + } + + for (ushort bid = 0; bid < _bufferRingEntries; bid++) + { + byte* slot = _bufRing + (uint)bid * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)_recvBufferSize); + *(uint*)(slot + 8) = 
_recvBufferSize; + *(ushort*)(slot + 12) = bid; + } + _bufRingTail = (ushort)_bufferRingEntries; + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + + private void ReturnBufferDirect(ushort bid) + { + byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)_recvBufferSize); + *(uint*)(slot + 8) = _recvBufferSize; + *(ushort*)(slot + 12) = bid; + _bufRingTail++; + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + + // ========================================================================= + // SQE producers (reactor-thread only) + // ========================================================================= + + private IoUringSqe* GetSqeOrFlush() + { + IoUringSqe* sqe = _ring.GetSqe(); + if (sqe != null) return sqe; + + _ring.SubmitAndWait(0); + sqe = _ring.GetSqe(); + if (sqe == null) throw new InvalidOperationException("SQ full after flush"); + return sqe; + } + + private void SubmitAcceptMultishot() + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_ACCEPT; + sqe->ioprio = IORING_ACCEPT_MULTISHOT; + sqe->fd = _listenFd; + sqe->user_data = KindAccept | (uint)_listenFd; + } + + private void SubmitRecvMultishot(int fd) + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_RECV; + sqe->flags = IOSQE_BUFFER_SELECT; + sqe->ioprio = IORING_RECV_MULTISHOT; + sqe->fd = fd; + sqe->buf_index = BgId; + sqe->user_data = KindRecv | (uint)fd; + } + + private static void SetNoDelay(int fd) + { + int one = 1; + setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int)); + } + + private static int OpenReusePortListener(ushort port) + { + int fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) throw new InvalidOperationException($"socket failed: {fd}"); + + int one = 1; + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int)); + setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 
&one, sizeof(int)); + + sockaddr_in addr = default; + addr.sin_family = AF_INET; + addr.sin_port = Htons(port); + addr.sin_addr.s_addr = 0; // 0.0.0.0 + + if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0) throw new InvalidOperationException("bind failed"); + if (listen(fd, 128) < 0) throw new InvalidOperationException("listen failed"); + return fd; + } +} diff --git a/Kite/_usings.cs b/Kite/_usings.cs new file mode 100644 index 0000000..3b85a30 --- /dev/null +++ b/Kite/_usings.cs @@ -0,0 +1,12 @@ +global using System; +global using System.Buffers; +global using System.Collections.Concurrent; +global using System.Collections.Generic; +global using System.IO.Pipelines; +global using System.Runtime.CompilerServices; +global using System.Runtime.InteropServices; +global using System.Threading; +global using System.Threading.Channels; +global using System.Threading.Tasks; +global using Spring; // reuse Spring's io_uring Ring +global using static Spring.Native; // reuse Spring's io_uring + libc bindings/constants diff --git a/Minima/Connection.cs b/Minima/Connection.cs deleted file mode 100644 index a9666a6..0000000 --- a/Minima/Connection.cs +++ /dev/null @@ -1,128 +0,0 @@ -using System.Threading.Tasks.Sources; -using static Minima.Native; - -namespace Minima; - -internal readonly struct RecvSnapshot -{ - public readonly long Tail; - public readonly bool IsClosed; - - public RecvSnapshot(long tail, bool isClosed) - { - Tail = tail; - IsClosed = isClosed; - } - - public static RecvSnapshot Closed() => new(0, isClosed: true); -} - -internal sealed unsafe class Connection : IValueTaskSource -{ - private readonly Reactor _reactor; - - public Connection(Reactor reactor) - { - _reactor = reactor; - } - - private ManualResetValueTaskSourceCore _readSignal; - private int _armed; - private int _closed; - - private readonly SpscRecvRing _recv = new(capacityPow2: 16); - - public ValueTask ReadAsync() - { - if (!_recv.IsEmpty()) - { - return new ValueTask(new 
RecvSnapshot(_recv.SnapshotTail(), _closed != 0)); - } - - if (_closed != 0) - { - return new ValueTask(RecvSnapshot.Closed()); - } - - if (_armed == 1) - { - throw new InvalidOperationException("ReadAsync already armed."); - } - - _armed = 1; - - return new ValueTask(this, _readSignal.Version); - } - - public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item) - => _recv.TryDequeueUntil(snap.Tail, out item); - - public void ResetRead() => _readSignal.Reset(); - - public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr) - { - if (res <= 0) - { - _closed = 1; - if (hasBuffer) - { - _reactor.ReturnBuffer(bid); - } - } - else if (!_recv.TryEnqueue(new SpscRecvRing.Item - { - Ptr = ptr, - Bid = bid, - Len = res, - HasBuffer = hasBuffer - })) - { - Console.Error.WriteLine("[conn] recv queue overflow; closing."); - if (hasBuffer) - { - _reactor.ReturnBuffer(bid); - } - _closed = 1; - } - - if (_armed == 1) - { - _armed = 0; - _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), _closed != 0)); - } - } - - public void MarkClosed() - { - _closed = 1; - - if (_armed == 1) - { - _armed = 0; - _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true)); - } - } - - public void QueueResponse(int fd) => _reactor.SubmitSend(fd, Program.s_responseBytes, (uint)Program.s_responseLen); - - public void Close(int fd) - { - while (_recv.TryDequeue(out SpscRecvRing.Item item)) - { - if (item.HasBuffer) - { - _reactor.ReturnBuffer(item.Bid); - } - } - - _reactor.Connections.Remove(fd); - close(fd); - } - - RecvSnapshot IValueTaskSource.GetResult(short token) => _readSignal.GetResult(token); - - ValueTaskSourceStatus IValueTaskSource.GetStatus(short token) => _readSignal.GetStatus(token); - - void IValueTaskSource.OnCompleted(Action continuation, object? 
state, short token, ValueTaskSourceOnCompletedFlags flags) - => _readSignal.OnCompleted(continuation, state, token, flags); -} diff --git a/Minima/Connection/Connection.Incremental.cs b/Minima/Connection/Connection.Incremental.cs new file mode 100644 index 0000000..bb3dad7 --- /dev/null +++ b/Minima/Connection/Connection.Incremental.cs @@ -0,0 +1,61 @@ +using System.Runtime.InteropServices; +using Minima.Utils; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Minima; + +/// +/// Incremental-mode (IOU_PBUF_RING_INC) per-connection buffer-ring state. +/// Each connection owns its own ring + slab; one buffer accumulates this +/// connection's byte stream across many recvs. The reactor (Reactor.Incremental) +/// drives setup/teardown and the refcounted recycle; this partial just holds the +/// state and routes a handler return to the right reactor entry point. +/// +/// All of these stay allocated across pool reuse and are freed in Dispose(). +/// +public sealed unsafe partial class Connection +{ + internal byte* BufRing; // kernel-shared ring control area + internal byte* BufSlab; // this connection's recv slab + internal ushort Bgid; + internal uint BufRingMask; + internal int BufRingEntries; + internal bool IncrementalMode; + + internal int[]? CumOffset; // per-bid: byte offset where the next slice begins + internal int[]? RefCount; // per-bid: outstanding handler refs + internal bool[]? KernelDone; // per-bid: kernel finished appending (no F_BUF_MORE) + + internal int Generation => Volatile.Read(ref _generation); + + /// + /// Called by the handler to hand a consumed recv buffer back. Routes by mode: + /// incremental returns carry (fd, gen, bid) for refcounted recycle; the shared + /// path returns the bare bid to the reactor's single buf_ring. 
+ /// + public void ReturnBuffer(in SpscRecvRing.Item item) + { + if (IncrementalMode) + { + _reactor.EnqueueReturnQIncremental(ClientFd, item.Gen, item.Bid); + } + else + { + _reactor.EnqueueReturnQ(item.Bid); + } + } + + private void DisposeIncremental() + { + if (BufRing != null) + { + NativeMemory.AlignedFree(BufRing); + BufRing = null; + } + if (BufSlab != null) + { + NativeMemory.AlignedFree(BufSlab); + BufSlab = null; + } + } +} diff --git a/Minima/Connection/Connection.Read.cs b/Minima/Connection/Connection.Read.cs new file mode 100644 index 0000000..117739f --- /dev/null +++ b/Minima/Connection/Connection.Read.cs @@ -0,0 +1,163 @@ +using System.Threading.Tasks.Sources; +using Minima.Utils; + +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Minima; + +/// +/// Per-connection state. The handler may run on any thread (e.g. resumed by +/// a thread-pool timer); reactor-only side effects are funnelled through the +/// MPSC queues on `Reactor`. Coordination uses Interlocked.Exchange on the +/// arm flags and a sticky `_pending` to close the lost-wakeup race. +/// +/// Lifetime is pool-managed: the reactor pops a Connection on accept (or new +/// one if pool is empty), and pushes it back on teardown after `Clear()`. The +/// `_generation` field is bumped on each `Clear` so stale `ValueTask` tokens +/// from a previous connection life are detectable and return `Closed()` +/// instead of leaking the new tenant's state. 
+/// +public sealed unsafe partial class Connection : IValueTaskSource +{ + internal Connection SetFd(int fd) + { + ClientFd = fd; + return this; + } + + private ManualResetValueTaskSourceCore _readSignal; + private int _armed; + private int _pending; + private int _closed; + + private readonly SpscRecvRing _recv = new(capacityPow2: 16); + + public ValueTask ReadAsync() + { + if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1) + { + Volatile.Write(ref _pending, 0); + return new ValueTask( + new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0)); + } + + if (Volatile.Read(ref _closed) != 0) + { + return new ValueTask(RecvSnapshot.Closed()); + } + + if (Interlocked.Exchange(ref _armed, 1) == 1) + { + throw new InvalidOperationException("ReadAsync already armed."); + } + + // Snapshot the generation as the IVTS token so a future Clear() can + // invalidate this awaiter if the connection gets pool-recycled. + int gen = Volatile.Read(ref _generation); + + // Race recovery: re-check between arming and returning the IVTS task. 
+ if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0) + { + Volatile.Write(ref _pending, 0); + Interlocked.Exchange(ref _armed, 0); + + return new ValueTask( + new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0)); + } + + return new ValueTask(this, (short)gen); + } + + public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item) + => _recv.TryDequeueUntil(snap.Tail, out item); + + public void ResetRead() => _readSignal.Reset(); + + public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr) + { + if (!_recv.TryEnqueue(new SpscRecvRing.Item + { + Ptr = ptr, + Bid = bid, + Len = res, + HasBuffer = hasBuffer, + Gen = (ushort)Volatile.Read(ref _generation) + })) + { + Console.Error.WriteLine("[conn] recv queue overflow."); + if (hasBuffer) + { + _reactor.ReturnBufferDirect(bid); + } + Volatile.Write(ref _closed, 1); + } + + if (Interlocked.Exchange(ref _armed, 0) == 1) + { + _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0)); + } + else + { + Volatile.Write(ref _pending, 1); + } + } + + internal void DrainRecv() + { + // Return any buffer IDs still sitting in the SPSC ring (handler exited + // before draining them, or a recv arrived after _closed was set). + while (_recv.TryDequeue(out SpscRecvRing.Item item)) + { + if (item.HasBuffer) + { + _reactor.ReturnBufferDirect(item.Bid); + } + } + } + + // ========================================================================= + // IValueTaskSource plumbing — token (= snapshot of `_generation` at await + // time) is compared against the current `_generation` to detect stale + // awaiters from before a Clear()/pool reuse. Stale awaiters get a + // sentinel result rather than the new tenant's state. 
+ // + // For the actual IVTS dispatch we pass `_readSignal.Version` / + // `_flushSignal.Version` to the underlying core (not `token`) because the + // core's version is bumped by ResetRead/CompleteFlush mid-life and is + // unrelated to the cross-life generation guard. + // ========================================================================= + + RecvSnapshot IValueTaskSource.GetResult(short token) + { + if (token != (short)Volatile.Read(ref _generation)) + { + return RecvSnapshot.Closed(); + } + + return _readSignal.GetResult(_readSignal.Version); + } + + ValueTaskSourceStatus IValueTaskSource.GetStatus(short token) + { + if (token != (short)Volatile.Read(ref _generation)) + { + return ValueTaskSourceStatus.Succeeded; + } + + return _readSignal.GetStatus(_readSignal.Version); + } + + void IValueTaskSource.OnCompleted(Action continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags) + { + if (token != (short)Volatile.Read(ref _generation)) + { + // Stale — run the continuation now so the awaiter unblocks and + // gets RecvSnapshot.Closed() from GetResult. 
+ continuation(state); + + return; + } + + _readSignal.OnCompleted(continuation, state, _readSignal.Version, flags); + } +} diff --git a/Minima/Connection/Connection.Write.cs b/Minima/Connection/Connection.Write.cs new file mode 100644 index 0000000..a2bab99 --- /dev/null +++ b/Minima/Connection/Connection.Write.cs @@ -0,0 +1,168 @@ +using System.Buffers; +using System.Threading.Tasks.Sources; +using Minima.Utils; + +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Minima; + +public sealed unsafe partial class Connection : IValueTaskSource, IBufferWriter +{ + private readonly int _writeSlabSize; + internal byte* WriteBuffer; + internal int WriteHead; + internal int WriteTail; + internal int WriteInFlight; + + private readonly UnmanagedMemoryManager _manager; + + private ManualResetValueTaskSourceCore _flushSignal = new() + { + RunContinuationsAsynchronously = false, + }; + private int _flushArmed; + private int _flushInProgress; + + // IBufferWrite +#region IBufferWrite + + public Memory GetMemory(int sizeHint = 0) + { + if (Volatile.Read(ref _flushInProgress) != 0) + { + throw new InvalidOperationException("Cannot write while flush is in progress."); + } + + int remaining = _writeSlabSize - WriteTail; + if (sizeHint > remaining) + { + throw new InvalidOperationException("Buffer too small."); + } + + return _manager.Memory.Slice(WriteTail, remaining); + } + + public Span GetSpan(int sizeHint = 0) + { + if (Volatile.Read(ref _flushInProgress) != 0) + { + throw new InvalidOperationException("Cannot write while flush is in progress."); + } + + if (WriteTail + sizeHint > _writeSlabSize) + { + throw new InvalidOperationException("Write buffer too small."); + } + + return new Span(WriteBuffer + WriteTail, _writeSlabSize - WriteTail); + } + + public void Advance(int count) + { + if (Volatile.Read(ref _flushInProgress) != 0) + { + throw new InvalidOperationException("Cannot write while flush is in progress."); + } + + WriteTail += count; + } + +#endregion + 
+ // Write to the inner buffer + public void Write(ReadOnlySpan source) + { + if (Volatile.Read(ref _flushInProgress) != 0) + { + throw new InvalidOperationException("Cannot write while flush is in progress."); + } + + int len = source.Length; + if (WriteTail + len > _writeSlabSize) + { + throw new InvalidOperationException("Write buffer too small."); + } + + source.CopyTo(new Span(WriteBuffer + WriteTail, len)); + WriteTail += len; + } + + // Flush inner buffer data to the kernel + public ValueTask FlushAsync() + { + if (Interlocked.Exchange(ref _flushInProgress, 1) == 1) + { + throw new InvalidOperationException("FlushAsync already in progress."); + } + + int target = WriteTail; + if (target == 0) + { + Volatile.Write(ref _flushInProgress, 0); + + return default; + } + + if (Interlocked.Exchange(ref _flushArmed, 1) == 1) + { + throw new InvalidOperationException("FlushAsync already armed."); + } + + _flushSignal.Reset(); + WriteInFlight = target; + + int gen = Volatile.Read(ref _generation); + + _reactor.EnqueueFlush(ClientFd); + + return new ValueTask(this, (short)gen); + } + + // Signal the FlushAsync was completed, called by the reactor's dispatcher send branch + internal void CompleteFlush() + { + WriteHead = 0; + WriteTail = 0; + WriteInFlight = 0; + Volatile.Write(ref _flushInProgress, 0); + Interlocked.Exchange(ref _flushArmed, 0); + + _flushSignal.SetResult(true); + } + + // IValueTaskSource +#region IValueTaskSource + + void IValueTaskSource.GetResult(short token) + { + if (token != (short)Volatile.Read(ref _generation)) + { + return; + } + + _flushSignal.GetResult(_flushSignal.Version); + } + + ValueTaskSourceStatus IValueTaskSource.GetStatus(short token) + { + if (token != (short)Volatile.Read(ref _generation)) + { + return ValueTaskSourceStatus.Succeeded; + } + + return _flushSignal.GetStatus(_flushSignal.Version); + } + + void IValueTaskSource.OnCompleted(Action continuation, object? 
state, short token, ValueTaskSourceOnCompletedFlags flags) + { + if (token != (short)Volatile.Read(ref _generation)) + { + continuation(state); + + return; + } + _flushSignal.OnCompleted(continuation, state, _flushSignal.Version, flags); + } + +#endregion +} \ No newline at end of file diff --git a/Minima/Connection/Connection.cs b/Minima/Connection/Connection.cs new file mode 100644 index 0000000..291a73a --- /dev/null +++ b/Minima/Connection/Connection.cs @@ -0,0 +1,89 @@ +using System.Runtime.InteropServices; +using Minima.Utils; + +namespace Minima; + +public sealed unsafe partial class Connection +{ + private readonly Reactor _reactor; + + public int ClientFd { get; private set; } + + // Bumped on Clear(); the low 16 bits are used as the IVTS token so stale + // awaiters can be detected after pool reuse. + private int _generation; + + public Connection(Reactor reactor, int fd, int writeSlabSize = 1024 * 16) + { + _reactor = reactor; + ClientFd = fd; + _writeSlabSize = writeSlabSize; + WriteBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)writeSlabSize, 64); + + _manager = new UnmanagedMemoryManager(WriteBuffer, writeSlabSize); + } + + // ========================================================================= + // Pool lifecycle — invoked from Reactor.Dispatch's recv/send error paths. + // Reactor-thread only. 
+ // + // teardown: MarkClosed() → wake awaiters with closed=1 + // DrainRecv() → return any in-flight buf_ring items + // close(fd) + // Clear() → reset state, bump _generation + // push to pool, OR Dispose() if pool is full + // ========================================================================= + + public void MarkClosed() + { + Volatile.Write(ref _closed, 1); + + if (Interlocked.Exchange(ref _armed, 0) == 1) + { + _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true)); + } + else + { + Volatile.Write(ref _pending, 1); + } + + if (Interlocked.Exchange(ref _flushArmed, 0) == 1) + { + Volatile.Write(ref _flushInProgress, 0); + _flushSignal.SetResult(true); + } + } + + internal void Clear() + { + // Bump generation first — readers of IVTS plumbing observe this via + // Volatile.Read and stale tokens get RecvSnapshot.Closed() / no-op. + Interlocked.Increment(ref _generation); + + Volatile.Write(ref _armed, 0); + Volatile.Write(ref _pending, 0); + Volatile.Write(ref _closed, 0); + Volatile.Write(ref _flushArmed, 0); + Volatile.Write(ref _flushInProgress, 0); + + WriteHead = 0; + WriteTail = 0; + WriteInFlight = 0; + + _readSignal.Reset(); + _flushSignal.Reset(); + + _recv.Reset(); // discard any leftover SPSC items + IncrementalMode = false; // per-conn ring (if any) was torn down before Clear + } + + public void Dispose() + { + if (WriteBuffer != null) + { + NativeMemory.AlignedFree(WriteBuffer); + WriteBuffer = null; + } + DisposeIncremental(); + } +} \ No newline at end of file diff --git a/Minima/Connection/ConnectionDualPipe.cs b/Minima/Connection/ConnectionDualPipe.cs new file mode 100644 index 0000000..7b40e74 --- /dev/null +++ b/Minima/Connection/ConnectionDualPipe.cs @@ -0,0 +1,16 @@ +using System.IO.Pipelines; + +namespace Minima; + +public sealed class ConnectionDualPipe : IDuplexPipe +{ + public PipeReader Input { get; } + public PipeWriter Output { get; } + + public ConnectionDualPipe(Connection connection) + { + 
ArgumentNullException.ThrowIfNull(connection); + Input = new ConnectionPipeReader(connection); + Output = new ConnectionPipeWriter(connection); + } +} \ No newline at end of file diff --git a/Minima/Connection/ConnectionPipeReader.cs b/Minima/Connection/ConnectionPipeReader.cs new file mode 100644 index 0000000..14d9ca6 --- /dev/null +++ b/Minima/Connection/ConnectionPipeReader.cs @@ -0,0 +1,181 @@ +using System.Buffers; +using System.IO.Pipelines; +using Minima.Utils; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Minima; + +/// +/// Adapts Minima's raw read API (ReadAsync + TryGetItem +/// + ReturnBuffer) to a standard . Recv buffers are +/// exposed zero-copy as a ReadOnlySequence<byte> (one segment per buffer) +/// and held until AdvanceTo consumes them, at which point fully-consumed buffers +/// are returned to the reactor. +/// +/// Convenience/compat layer for PipeReader consumers — the raw ReadAsync/ +/// TryGetItem path stays the faster one (this adds held-buffer + sequence +/// bookkeeping per read). +/// +public sealed class ConnectionPipeReader : PipeReader +{ + private readonly Connection _conn; + private readonly List _held = new(16); + private ReadOnlySequence _lastSequence; + + private bool _completed; + private bool _cancelRequested; + private bool _connectionClosed; + + private readonly struct Held + { + public readonly ReadOnlyMemory Memory; + public readonly SpscRecvRing.Item Item; + + public Held(ReadOnlyMemory memory, SpscRecvRing.Item item) + { + Memory = memory; + Item = item; + } + + public Held WithMemory(ReadOnlyMemory memory) => new(memory, Item); + } + + public ConnectionPipeReader(Connection connection) + { + _conn = connection ?? 
throw new ArgumentNullException(nameof(connection)); + } + + public override async ValueTask ReadAsync(CancellationToken cancellationToken = default) + { + ThrowIfCompleted(); + + if (_cancelRequested) + { + _cancelRequested = false; + return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed); + } + + // Anything still held from a previous read that wasn't fully consumed. + if (_held.Count > 0) + return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed); + + if (_connectionClosed) + return new ReadResult(default, isCanceled: false, isCompleted: true); + + RecvSnapshot snap = await _conn.ReadAsync(); + + while (_conn.TryGetItem(snap, out SpscRecvRing.Item item)) + { + if (item.HasBuffer) + _held.Add(new Held(item.AsMemoryManager().Memory, item)); + } + + _conn.ResetRead(); + + if (snap.IsClosed) + _connectionClosed = true; + + if (_cancelRequested) + { + _cancelRequested = false; + return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed); + } + + return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed); + } + + public override bool TryRead(out ReadResult result) + { + ThrowIfCompleted(); + + if (_held.Count > 0) + { + result = new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed); + return true; + } + + if (_connectionClosed) + { + result = new ReadResult(default, isCanceled: false, isCompleted: true); + return true; + } + + result = default; + return false; + } + + public override void AdvanceTo(SequencePosition consumed) => AdvanceTo(consumed, consumed); + + public override void AdvanceTo(SequencePosition consumed, SequencePosition examined) + { + if (_held.Count == 0) + return; + + long consumedBytes = _lastSequence.Slice(0, consumed).Length; + + while (_held.Count > 0 && consumedBytes > 0) + { + Held seg = _held[0]; + int available = seg.Memory.Length; + + if (consumedBytes >= available) + { + // Whole 
buffer consumed — return it to the reactor. + _conn.ReturnBuffer(seg.Item); + _held.RemoveAt(0); + consumedBytes -= available; + } + else + { + // Partial — keep the unconsumed tail of this buffer. + _held[0] = seg.WithMemory(seg.Memory[(int)consumedBytes..]); + consumedBytes = 0; + } + } + } + + public override void CancelPendingRead() => _cancelRequested = true; + + public override void Complete(Exception? exception = null) + { + if (_completed) + return; + + _completed = true; + + for (int i = 0; i < _held.Count; i++) + _conn.ReturnBuffer(_held[i].Item); + + _held.Clear(); + } + + private ReadOnlySequence BuildSequence() + { + if (_held.Count == 0) + { + _lastSequence = default; + return _lastSequence; + } + + if (_held.Count == 1) + { + _lastSequence = new ReadOnlySequence(_held[0].Memory); + return _lastSequence; + } + + var head = new RingSegment(_held[0].Memory, _held[0].Item.Bid); + RingSegment tail = head; + + for (int i = 1; i < _held.Count; i++) + tail = tail.Append(_held[i].Memory, _held[i].Item.Bid); + + _lastSequence = new ReadOnlySequence(head, 0, tail, tail.Memory.Length); + return _lastSequence; + } + + private void ThrowIfCompleted() + { + if (_completed) + throw new InvalidOperationException("Reading is not allowed after the reader was completed."); + } +} diff --git a/Minima/Connection/ConnectionPipeWriter.cs b/Minima/Connection/ConnectionPipeWriter.cs new file mode 100644 index 0000000..56be337 --- /dev/null +++ b/Minima/Connection/ConnectionPipeWriter.cs @@ -0,0 +1,63 @@ +using System.IO.Pipelines; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Minima; + +/// +/// Adapts Minima's write API (GetMemory/GetSpan/Advance/ +/// FlushAsync) to a standard , so PipeWriter-based code +/// can write responses through the connection's per-connection slab. +/// A thin wrapper — all the work lives in Connection. 
+/// +public sealed class ConnectionPipeWriter : PipeWriter +{ + private readonly Connection _conn; + private bool _completed; + private bool _cancelRequested; + private long _unflushed; + + public ConnectionPipeWriter(Connection connection) + { + _conn = connection ?? throw new ArgumentNullException(nameof(connection)); + } + + public override bool CanGetUnflushedBytes => true; + public override long UnflushedBytes => _unflushed; + + public override Memory GetMemory(int sizeHint = 0) => _conn.GetMemory(sizeHint); + + public override Span GetSpan(int sizeHint = 0) => _conn.GetSpan(sizeHint); + + public override void Advance(int bytes) + { + _unflushed += bytes; + _conn.Advance(bytes); + } + + public override ValueTask FlushAsync(CancellationToken cancellationToken = default) + { + if (_cancelRequested) + { + _cancelRequested = false; + return new ValueTask(new FlushResult(isCanceled: true, isCompleted: _completed)); + } + + _unflushed = 0; + ValueTask inner = _conn.FlushAsync(); + + if (inner.IsCompletedSuccessfully) + return new ValueTask(new FlushResult(isCanceled: false, isCompleted: _completed)); + + return AwaitFlush(inner); + } + + private async ValueTask AwaitFlush(ValueTask inner) + { + await inner; + return new FlushResult(isCanceled: false, isCompleted: _completed); + } + + public override void CancelPendingFlush() => _cancelRequested = true; + + public override void Complete(Exception? 
exception = null) => _completed = true; +} diff --git a/Minima/Connection/RecvSnapshot.cs b/Minima/Connection/RecvSnapshot.cs new file mode 100644 index 0000000..015afc4 --- /dev/null +++ b/Minima/Connection/RecvSnapshot.cs @@ -0,0 +1,15 @@ +namespace Minima; + +public readonly struct RecvSnapshot +{ + public readonly long Tail; + public readonly bool IsClosed; + + public RecvSnapshot(long tail, bool isClosed) + { + Tail = tail; + IsClosed = isClosed; + } + + public static RecvSnapshot Closed() => new(0, isClosed: true); +} \ No newline at end of file diff --git a/Minima/Program.cs b/Minima/Program.cs index c9d016e..fa2782c 100644 --- a/Minima/Program.cs +++ b/Minima/Program.cs @@ -1,4 +1,6 @@ -using System.Runtime.InteropServices; +using System.Buffers; +using System.IO.Pipelines; +using Minima.Utils; namespace Minima; @@ -11,39 +13,28 @@ namespace Minima; /// internal static unsafe class Program { - private const ushort Port = 8080; - private const uint RingEntries = 8192; - internal const int BufferSize = 32 * 1024; - - // user_data layout: kind in high 32 bits, fd in low 32 bits. - internal const ulong KindAccept = 1UL << 32; - internal const ulong KindRecv = 2UL << 32; - internal const ulong KindSend = 3UL << 32; - - // Pre-built HTTP/1.1 response shared across all reactors - internal static byte* s_responseBytes; - internal static int s_responseLen; - - private static ReadOnlySpan s_response => "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8; + internal static ReadOnlySpan Response => + "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8; private static int Main() { - s_responseLen = s_response.Length; - s_responseBytes = (byte*)NativeMemory.Alloc((nuint)s_responseLen); - s_response.CopyTo(new Span(s_responseBytes, s_responseLen)); + // All tunables live in ServerConfig — override the defaults here. 
+ var config = new ServerConfig() + { + UsePipe = false, + }; - var n = 12; - Console.WriteLine($"[Minima] starting {n} reactors on port {Port}"); + Console.WriteLine($"[Minima] starting {config.ReactorCount} reactors on port {config.Port} (incremental={config.Incremental})"); - var threads = new Thread[n]; - for (var i = 0; i < n; i++) + var threads = new Thread[config.ReactorCount]; + for (var i = 0; i < config.ReactorCount; i++) { - var reactor = new Reactor(i, Port, RingEntries); - + var reactor = new Reactor(i, config); + threads[i] = new Thread(reactor.Run) { - Name = $"reactor-{i}", - IsBackground = false + Name = $"reactor-{i}", + IsBackground = false }; threads[i].Start(); } @@ -52,14 +43,14 @@ private static int Main() { t.Join(); } - + return 0; } } internal static class Handler { - public static async Task HandleAsync(Reactor reactor, int fd, Connection conn) + public static async Task HandleAsync(Reactor reactor, Connection conn) { try { @@ -76,14 +67,21 @@ public static async Task HandleAsync(Reactor reactor, int fd, Connection conn) // data is now usable with any BCL Memory/async API _ = data.Length; - reactor.ReturnBuffer(mem.BufferId); + // Cross-thread safe and mode-agnostic: routes to the + // shared-ring return or the incremental refcounted return. + conn.ReturnBuffer(in item); } - conn.QueueResponse(fd); } + // One response per recv burst — accumulate in the connection's + // per-connection write slab, then submit and await ack. + conn.Write(Program.Response); + await conn.FlushAsync(); + if (snap.IsClosed) { - conn.Close(fd); + // Reactor already owns teardown (Connections.Remove + close + // happens in Dispatch's recv-error branch); we just exit. 
return; } @@ -92,8 +90,50 @@ public static async Task HandleAsync(Reactor reactor, int fd, Connection conn) } catch (Exception ex) { - Console.Error.WriteLine($"[r{reactor.Id}] handler crash on fd={fd}: {ex}"); - conn.Close(fd); + Console.Error.WriteLine($"[r{reactor.Id}] handler crash on fd={conn.ClientFd}: {ex}"); + // Reactor will clean the connection up via the recv-error path + // (or SPSC overflow) on the next CQE for this fd. + } + } + + // PipeReader/PipeWriter variant — same behavior, driven through the BCL + // pipe adapters instead of the raw ReadAsync/TryGetItem/Write API. + public static async Task HandlePipeAsync(Reactor reactor, Connection conn) + { + var reader = new ConnectionPipeReader(conn); + var writer = new ConnectionPipeWriter(conn); + + try + { + while (true) + { + ReadResult read = await reader.ReadAsync(); + ReadOnlySequence buffer = read.Buffer; + + if (!buffer.IsEmpty) + { + // A real server would parse requests out of `buffer` here. + writer.Write(Program.Response); + await writer.FlushAsync(); + } + + // Consume everything we got; AdvanceTo returns the recv buffers. + reader.AdvanceTo(buffer.End); + + if (read.IsCompleted) + { + break; + } + } + } + catch (Exception ex) + { + Console.Error.WriteLine($"[r{reactor.Id}] pipe handler crash on fd={conn.ClientFd}: {ex}"); + } + finally + { + reader.Complete(); + writer.Complete(); } } } diff --git a/Minima/Reactor.cs b/Minima/Reactor.cs deleted file mode 100644 index 351ca6b..0000000 --- a/Minima/Reactor.cs +++ /dev/null @@ -1,258 +0,0 @@ -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using static Minima.Native; -using static Minima.Program; -// ReSharper disable SuggestVarOrType_BuiltInTypes - -namespace Minima; - -/// -/// One reactor = one thread + one io_uring + one listening socket (SO_REUSEPORT) -/// + one connection map. Fully isolated from other reactors; the kernel -/// load-balances incoming connections across all SO_REUSEPORT listeners. 
-/// -internal sealed unsafe class Reactor -{ - public readonly int Id; - public Ring Ring = null!; // created on the reactor's own thread (DEFER_TASKRUN requires same-thread setup+enter) - public readonly Dictionary Connections = new(); - - private int _listenFd; - private readonly ushort _port; - private readonly uint _ringEntries; - - // Provided-buffer ring (one per reactor, shared by all its connections). - private const ushort BgId = 1; - private const uint BufferRingEntries = 4096; // power of two - private byte* _bufRing; // io_uring_buf_ring (kernel-shared) - private byte* _bufSlab; // contiguous slab of recv buffers - private uint _bufRingMask; - private ushort _bufRingTail; - - public Reactor(int id, ushort port, uint ringEntries) - { - Id = id; - _port = port; - _ringEntries = ringEntries; - } - - // Buffer ring - - private void InitBufferRing() - { - nuint ringBytes = (nuint)BufferRingEntries * 16; - _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096); - NativeMemory.Clear(_bufRing, ringBytes); - - nuint slabBytes = BufferRingEntries * (nuint)BufferSize; - _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64); - - _bufRingMask = BufferRingEntries - 1; - - var reg = new io_uring_buf_reg { - ring_addr = (ulong)_bufRing, - ring_entries = BufferRingEntries, - bgid = BgId, - }; - - int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, ®, 1); - if (ret < 0) - { - int err = Marshal.GetLastPInvokeError(); - - throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}"); - } - - // Populate every slot once. Slot 0 overlaps with the ring's tail field - // at offset 14, but we only write addr/len/bid (offsets 0..13) so tail - // stays at zero until we set it explicitly. 
- for (ushort bid = 0; bid < BufferRingEntries; bid++) { - byte* slot = _bufRing + (uint)bid * 16; - *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)BufferSize); - *(uint*)(slot + 8) = BufferSize; - *(ushort*)(slot + 12) = bid; - } - _bufRingTail = (ushort)BufferRingEntries; - - Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); - } - - public void ReturnBuffer(ushort bid) - { - byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16; - *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)BufferSize); - *(uint*)(slot + 8) = BufferSize; - *(ushort*)(slot + 12) = bid; - _bufRingTail++; - - Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); - } - - public void Run() - { - Ring = Ring.Create(_ringEntries); - _listenFd = OpenReusePortListener(_port); - - InitBufferRing(); - - Console.WriteLine($"[r{Id}] listening on 0.0.0.0:{_port}"); - SubmitAcceptMultishot(); - - while (true) - { - int rc = Ring.SubmitAndWait(1); - if (rc < 0 && rc != -4 /* EINTR */) - { - Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}"); - break; - } - - while (Ring.TryGetCqe(out IoUringCqe cqe)) - { - Dispatch(in cqe); - Ring.CqeSeen(); - } - } - - close(_listenFd); - Ring.Dispose(); - } - - private void Dispatch(in IoUringCqe cqe) - { - ulong kind = cqe.user_data & 0xffffffff_00000000UL; - int fd = (int)(cqe.user_data & 0xffffffffUL); - bool more = (cqe.flags & IORING_CQE_F_MORE) != 0; - - if (kind == KindAccept) - { - if (cqe.res >= 0) - { - int clientFd = cqe.res; - var conn = new Connection(this); - Connections[clientFd] = conn; - SubmitRecvMultishot(clientFd); - - _ = Handler.HandleAsync(this, clientFd, conn); - } - else - { - Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}"); - } - // Multishot accept stays armed; only re-arm if the kernel terminated it. - if (!more) - { - SubmitAcceptMultishot(); - } - } - else if (kind == KindRecv) - { - bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0; - ushort bid = hasBuf ? 
(ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0; - - if (!Connections.TryGetValue(fd, out var conn)) - { - if (hasBuf) ReturnBuffer(bid); - - return; - } - - byte* ptr = hasBuf ? _bufSlab + (nuint)bid * (nuint)BufferSize : null; - conn.Complete(cqe.res, bid, hasBuf, ptr); - - if (!more && cqe.res > 0) - { - SubmitRecvMultishot(fd); - } - } - else if (kind == KindSend) - { - if (Connections.TryGetValue(fd, out var conn) && cqe.res <= 0) - { - conn.MarkClosed(); - } - } - } - - private IoUringSqe* GetSqeOrFlush() - { - IoUringSqe* sqe = Ring.GetSqe(); - if (sqe != null) - { - return sqe; - } - - Ring.SubmitAndWait(0); - sqe = Ring.GetSqe(); - - if (sqe == null) - { - throw new InvalidOperationException("SQ full after flush"); - } - - return sqe; - } - - private void SubmitAcceptMultishot() - { - IoUringSqe* sqe = GetSqeOrFlush(); - Unsafe.InitBlockUnaligned(sqe, 0, 64); - sqe->opcode = IORING_OP_ACCEPT; - sqe->ioprio = IORING_ACCEPT_MULTISHOT; - sqe->fd = _listenFd; - sqe->user_data = KindAccept | (uint)_listenFd; - } - - public void SubmitRecvMultishot(int fd) - { - IoUringSqe* sqe = GetSqeOrFlush(); - Unsafe.InitBlockUnaligned(sqe, 0, 64); - sqe->opcode = IORING_OP_RECV; - sqe->flags = IOSQE_BUFFER_SELECT; - sqe->ioprio = IORING_RECV_MULTISHOT; - sqe->fd = fd; - sqe->buf_index = BgId; // same offset as buf_group in the kernel union - sqe->user_data = KindRecv | (uint)fd; - } - - public void SubmitSend(int fd, byte* buf, uint len) - { - IoUringSqe* sqe = GetSqeOrFlush(); - Unsafe.InitBlockUnaligned(sqe, 0, 64); - sqe->opcode = IORING_OP_SEND; - sqe->fd = fd; - sqe->addr = (ulong)buf; - sqe->len = len; - sqe->user_data = KindSend | (uint)fd; - } - - private static int OpenReusePortListener(ushort port) - { - int fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) - { - throw new InvalidOperationException($"socket failed: {fd}"); - } - - int one = 1; - setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int)); - setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 
&one, sizeof(int)); - - sockaddr_in addr = default; - addr.sin_family = AF_INET; - addr.sin_port = Htons(port); - addr.sin_addr.s_addr = 0; // 0.0.0.0 - - if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0) - { - throw new InvalidOperationException("bind failed"); - } - - if (listen(fd, 128) < 0) - { - throw new InvalidOperationException("listen failed"); - } - - return fd; - } -} diff --git a/Minima/Reactor/Reactor.Incremental.cs b/Minima/Reactor/Reactor.Incremental.cs new file mode 100644 index 0000000..6da3f68 --- /dev/null +++ b/Minima/Reactor/Reactor.Incremental.cs @@ -0,0 +1,302 @@ +using System.Runtime.InteropServices; +using Minima.Utils; +using static Minima.Native; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Minima; + +/// +/// Incremental-buffer (IOU_PBUF_RING_INC) path. Each connection gets its own +/// buffer ring: one buffer accumulates that connection's byte stream across many +/// recvs, so buffers are recycled only when the kernel is done appending AND the +/// handler has returned every slice it was handed. Selected per reactor by the +/// `_incremental` flag; the shared-ring path in Reactor.cs is untouched. +/// +public sealed unsafe partial class Reactor +{ + private Stack? _freeGids; + private Mpsc? _returnQInc; + + private void InitIncremental() + { + // Per-connection rings; no shared ring. GID 1 reserved; per-conn GIDs 2..MaxConnections+1. 
+ _freeGids = new Stack(MaxConnections); + for (int g = MaxConnections + 1; g >= 2; g--) + _freeGids.Push((ushort)g); + + _returnQInc = new Mpsc(1 << 16); + } + + private ushort AllocGid() => _freeGids!.Pop(); + private void FreeGid(ushort gid) => _freeGids!.Push(gid); + + // ========================================================================= + // Per-connection ring lifecycle + // ========================================================================= + + private void SetupConnectionBufRing(Connection conn) + { + ushort gid = AllocGid(); + int entries = ConnBufRingEntries; + + // Ring control area + slab + tracking arrays are allocated once and + // reused across pool lives; only the kernel registration is per-life. + if (conn.BufRing == null) + conn.BufRing = (byte*)NativeMemory.AlignedAlloc((nuint)entries * 16, 4096); + NativeMemory.Clear(conn.BufRing, (nuint)entries * 16); + + if (conn.BufSlab == null) + conn.BufSlab = (byte*)NativeMemory.AlignedAlloc((nuint)entries * (nuint)IncRecvBufferSize, 64); + + conn.CumOffset ??= new int[entries]; + conn.RefCount ??= new int[entries]; + conn.KernelDone ??= new bool[entries]; + Array.Clear(conn.CumOffset, 0, entries); + Array.Clear(conn.RefCount, 0, entries); + Array.Clear(conn.KernelDone, 0, entries); + + var reg = new io_uring_buf_reg + { + ring_addr = (ulong)conn.BufRing, + ring_entries = (uint)entries, + bgid = gid, + flags = IOU_PBUF_RING_INC, + }; + int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, ®, 1); + if (ret < 0) + throw new InvalidOperationException($"register pbuf_ring (inc) failed: ret={ret} gid={gid}"); + + conn.Bgid = gid; + conn.BufRingEntries = entries; + conn.BufRingMask = (uint)(entries - 1); + conn.IncrementalMode = true; + + for (ushort bid = 0; bid < entries; bid++) + { + byte* slot = conn.BufRing + (uint)bid * 16; + *(ulong*)(slot + 0) = (ulong)(conn.BufSlab + bid * (nuint)IncRecvBufferSize); + *(uint*)(slot + 8) = IncRecvBufferSize; + *(ushort*)(slot + 12) = bid; + } + 
Volatile.Write(ref *(ushort*)(conn.BufRing + 14), (ushort)entries); + } + + private void TeardownConnectionBufRing(Connection conn) + { + if (conn.IncrementalMode) + { + var reg = new io_uring_buf_reg { bgid = conn.Bgid }; + io_uring_register(Ring.Fd, IORING_UNREGISTER_PBUF_RING, ®, 1); + FreeGid(conn.Bgid); + } + // BufRing / BufSlab / arrays stay allocated for pool reuse. + } + + // Re-add a fully-consumed buffer to its connection's ring (reactor-thread only). + private void ReturnConnectionBuffer(Connection conn, ushort bid) + { + conn.CumOffset![bid] = 0; + conn.RefCount![bid] = 0; + conn.KernelDone![bid] = false; + + ushort tail = Volatile.Read(ref *(ushort*)(conn.BufRing + 14)); + byte* slot = conn.BufRing + (tail & conn.BufRingMask) * 16; + *(ulong*)(slot + 0) = (ulong)(conn.BufSlab + bid * (nuint)IncRecvBufferSize); + *(uint*)(slot + 8) = IncRecvBufferSize; + *(ushort*)(slot + 12) = bid; + Volatile.Write(ref *(ushort*)(conn.BufRing + 14), (ushort)(tail + 1)); + } + + // ========================================================================= + // Refcounted return path (handler → reactor), carrying (fd, gen, bid) + // ========================================================================= + + // (fd, gen, bid) packed into one ulong for the incremental return queue: + // fd in the high 32 bits, gen in the next 16, bid in the low 16. + private static ulong PackReturn(int fd, ushort gen, ushort bid) + => ((ulong)(uint)fd << 32) | ((ulong)gen << 16) | bid; + + private static void UnpackReturn(ulong packed, out int fd, out ushort gen, out ushort bid) + { + fd = (int)(packed >> 32); + gen = (ushort)((packed >> 16) & 0xFFFF); + bid = (ushort)(packed & 0xFFFF); + } + + public void EnqueueReturnQIncremental(int fd, ushort gen, ushort bid) + { + // Fast path: caller is the reactor thread (handler resumed inline). 
+ if (Environment.CurrentManagedThreadId == _reactorThreadId) + { + ApplyReturnIncremental(fd, gen, bid); + return; + } + ulong packed = PackReturn(fd, gen, bid); + SpinWait sw = default; + while (!_returnQInc!.TryEnqueue(packed)) + sw.SpinOnce(); + WakeFdWrite(); + } + + private void DrainReturnQIncremental() + { + while (_returnQInc!.TryDequeue(out ulong packed)) + { + UnpackReturn(packed, out int fd, out ushort gen, out ushort bid); + ApplyReturnIncremental(fd, gen, bid); + } + } + + private void ApplyReturnIncremental(int fd, ushort gen, ushort bid) + { + if (!Connections.TryGetValue(fd, out var conn) || !conn.IncrementalMode) + { + return; // fd gone / ring already torn down + } + if ((ushort)conn.Generation != gen) + { + return; // stale return from a previous life (fd reused) + } + + conn.RefCount![bid]--; + if (conn.RefCount[bid] <= 0 && conn.KernelDone![bid]) + { + ReturnConnectionBuffer(conn, bid); + } + } + + // ========================================================================= + // Incremental reactor loop + // ========================================================================= + + private void LoopIncremental() + { + while (true) + { + DrainReturnQIncremental(); + DrainFlushQ(); + + int rc = Ring.SubmitAndWait(1); + if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) + { + Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}"); + + break; + } + + uint ready = Ring.CqReady(); + for (uint i = 0; i < ready; i++) + { + DispatchIncremental(in Ring.CqeAt(i)); + } + Ring.CqAdvance(ready); + } + } + + private void DispatchIncremental(in IoUringCqe cqe) + { + ulong kind = cqe.user_data & 0xffffffff_00000000UL; + int fd = (int)(cqe.user_data & 0xffffffffUL); + bool more = (cqe.flags & IORING_CQE_F_MORE) != 0; + + if (kind == KindWake) + { + ulong drain; + read(_wakeFd, &drain, 8); + if (!more) + { + ArmWakePoll(); + } + return; + } + + if (kind == KindAccept) + { + if (cqe.res >= 0) + { + int clientFd = cqe.res; + 
SetNoDelay(clientFd);
+                // Reuse a pooled Connection when one is available, else allocate.
+                Connection conn = _pool.TryPop(out var pooled)
+                    ? pooled.SetFd(clientFd)
+                    : new Connection(this, clientFd, _config.WriteSlabSize);
+                Connections[clientFd] = conn;
+                // Incremental mode: the connection's own buffer ring is set up
+                // before recv is armed against its buffer group.
+                SetupConnectionBufRing(conn);
+                SubmitRecvMultishot(clientFd, conn.Bgid);
+
+                // Start the handler; the returned task is intentionally discarded.
+                _ = _config.UsePipe
+                    ? Handler.HandlePipeAsync(this, conn)
+                    : Handler.HandleAsync(this, conn);
+            }
+            else
+            {
+                Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}");
+            }
+            // Re-arm accept only when this CQE did not carry F_MORE.
+            if (!more)
+            {
+                SubmitAcceptMultishot();
+            }
+        }
+        else if (kind == KindRecv)
+        {
+            bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0;
+            bool bufMore = (cqe.flags & IORING_CQE_F_BUF_MORE) != 0;
+            // The buffer id sits in the upper bits of flags; 0 when no buffer was attached.
+            ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0;
+
+            if (cqe.res <= 0)
+            {
+                // Peer EOF / recv error — the whole per-conn ring is freed in Recycle.
+                if (Connections.Remove(fd, out var dyingConn))
+                {
+                    Recycle(dyingConn, fd);
+                }
+
+                return;
+            }
+
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                return; // straggler for a connection whose ring is already gone
+            }
+
+            // Data lands at the buffer's running offset; the kernel keeps
+            // appending to this bid until the buffer is full (F_BUF_MORE clear).
+            byte* ptr = conn.BufSlab + (nuint)bid * (nuint)IncRecvBufferSize + (nuint)conn.CumOffset![bid];
+            conn.CumOffset[bid] += cqe.res;
+            // One reference per delivered slice.
+            conn.RefCount![bid]++;
+            // The buffer is marked kernel-done when F_BUF_MORE or F_MORE is clear.
+            if (!bufMore || !more)
+            {
+                conn.KernelDone![bid] = true;
+            }
+
+            conn.Complete(cqe.res, bid, hasBuffer: true, ptr);
+
+            // Re-arm recv when this CQE did not carry F_MORE.
+            if (!more)
+            {
+                SubmitRecvMultishot(fd, conn.Bgid);
+            }
+        }
+        else if (kind == KindSend)
+        {
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                return;
+            }
+            // A zero or negative result tears the connection down.
+            if (cqe.res <= 0)
+            {
+                Connections.Remove(fd);
+                Recycle(conn, fd);
+
+                return;
+            }
+            conn.WriteHead += cqe.res;
+            // Partial send: submit the remaining bytes and wait for the next CQE.
+            if (conn.WriteHead < conn.WriteInFlight)
+            {
+                SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead));
+
+                return;
+            }
+
+            conn.CompleteFlush();
+        }
+    }
+}
diff --git a/Minima/Reactor/Reactor.cs b/Minima/Reactor/Reactor.cs
new file mode 100644
index 0000000..8483a68
--- /dev/null
+++ b/Minima/Reactor/Reactor.cs
@@ -0,0 +1,532 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Minima.Utils;
+using static Minima.Native;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace Minima;
+
+///
+/// One reactor = one thread + one io_uring + one listening socket (SO_REUSEPORT)
+/// + one connection map. The reactor thread is the sole writer of the SQ ring,
+/// the kernel-shared buf_ring, and the connection map. Handlers may run on any
+/// thread (e.g. resumed by a thread-pool timer after `await Task.Delay(1)`);
+/// they reach the reactor only through two MPSC queues (`_returnQ`, `_flushQ`)
+/// woken by an `eventfd` registered as a multishot poll in the ring.
+///
+public sealed unsafe partial class Reactor
+{
+    public readonly int Id;
+    public Ring Ring = null!;     // created on the reactor's own thread (DEFER_TASKRUN requires same-thread setup+enter)
+    // Live connections keyed by client fd.
+    public readonly Dictionary<int, Connection> Connections = new();
+
+    private int _listenFd;
+    private readonly ServerConfig _config;
+    private readonly ushort _port;
+    private readonly uint _ringEntries;
+    private readonly bool _incremental;
+    private readonly uint RecvBufferSize;
+
+    // CQE user_data layout: kind tag in the high 32 bits, fd in the low 32.
+    private const ulong KindAccept = 1UL << 32;
+    private const ulong KindRecv = 2UL << 32;
+    private const ulong KindSend = 3UL << 32;
+    private const ulong KindWake = 4UL << 32; // eventfd-based cross-thread wake
+
+    // Provided-buffer ring (one per reactor, shared by all its connections).
+    private const ushort BgId = 1;
+    private readonly uint BufferRingEntries; // power of two
+    private byte* _bufRing;       // io_uring_buf_ring (kernel-shared)
+    private byte* _bufSlab;       // contiguous slab of recv buffers
+    private uint _bufRingMask;
+    private ushort _bufRingTail;
+
+    // Cross-thread wake mechanism: handlers running off-reactor enqueue work
+    // into these MPSC queues and `eventfd_write` _wakeFd; a multishot poll on
+    // _wakeFd registered with the ring delivers a CQE that wakes the reactor.
+    // When the caller is already the reactor thread (the common case — handler
+    // resumed inline from an IVTS SetResult), the Enqueue* methods bypass
+    // the queue and call the direct op, avoiding 2 syscalls per request.
+    private int _wakeFd;
+    private int _reactorThreadId;
+    private readonly Mpsc<ushort> _returnQ = new(1 << 14); // 16384 slots, buffer ids to return
+    private readonly Mpsc<int> _flushQ = new(1 << 12);     // 4096 slots, fds with a pending flush
+
+    // Connection pool. Reactor-thread-only — accept and teardown both run on
+    // this reactor, so a plain Stack is sufficient (no MPMC primitive
+    // needed).
PoolMax caps the slab footprint per reactor:
+    //   PoolMax × WriteSlabSize × ReactorCount = total reserved native memory.
+    private readonly int PoolMax;
+    private readonly Stack<Connection> _pool;
+
+    // Incremental-mode (IOU_PBUF_RING_INC) sizing. Each connection gets its own
+    // ring, so reserved native memory is bounded by:
+    //   PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount.
+    // Keep entries small — the point of incremental is that one buffer holds
+    // many reads, so you need few of them per connection.
+    private readonly int MaxConnections;      // GID cap (one bgid per active connection)
+    private readonly int ConnBufRingEntries;  // buffers per connection ring
+    private readonly uint IncRecvBufferSize;  // bytes per buffer (filled incrementally)
+
+    // Transient io_uring_enter errnos (Linux): interrupted, would-block, busy.
+    private const int EINTR = 4;
+    private const int EAGAIN = 11;
+    private const int EBUSY = 16;
+
+    // Copies every tunable out of `config` and pre-sizes the connection pool.
+    // No ring, socket, or native memory is created here; that happens in Run().
+    public Reactor(int id, ServerConfig config)
+    {
+        Id = id;
+        _config = config;
+        _port = config.Port;
+        _ringEntries = config.RingEntries;
+        _incremental = config.Incremental;
+        RecvBufferSize = (uint)config.RecvBufferSize;
+        BufferRingEntries = (uint)config.BufferRingEntries;
+        PoolMax = config.PoolMax;
+        MaxConnections = config.MaxConnections;
+        ConnBufRingEntries = config.ConnBufRingEntries;
+        IncRecvBufferSize = (uint)config.IncRecvBufferSize;
+        _pool = new Stack<Connection>(config.PoolMax);
+    }
+
+    // =========================================================================
+    // Buffer ring
+    // =========================================================================
+
+    // Allocates the shared buffer ring (16 bytes per entry, 4096-aligned) and the
+    // recv slab, registers the ring with the kernel, then publishes every buffer.
+    private void InitBufferRing()
+    {
+        nuint ringBytes = (nuint)BufferRingEntries * 16;
+        _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096);
+        NativeMemory.Clear(_bufRing, ringBytes);
+
+        nuint slabBytes = BufferRingEntries * (nuint)RecvBufferSize;
+        _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64);
+
+        _bufRingMask = BufferRingEntries - 1;
+
+        var reg = new
io_uring_buf_reg {
+            ring_addr = (ulong)_bufRing,
+            ring_entries = BufferRingEntries,
+            bgid = BgId,
+        };
+
+        // The registration struct is passed by address.
+        int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, &reg, 1);
+        if (ret < 0)
+        {
+            int err = Marshal.GetLastPInvokeError();
+
+            throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}");
+        }
+
+        // Populate every slot once. Slot 0 overlaps with the ring's tail field
+        // at offset 14, but we only write addr/len/bid (offsets 0..13) so tail
+        // stays at zero until we set it explicitly.
+        for (ushort bid = 0; bid < BufferRingEntries; bid++) {
+            byte* slot = _bufRing + (uint)bid * 16;
+            *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+            *(uint*)(slot + 8) = RecvBufferSize;
+            *(ushort*)(slot + 12) = bid;
+        }
+        _bufRingTail = (ushort)BufferRingEntries;
+
+        // Publish all entries at once by writing the tail.
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    // Reactor-thread-only: writes the kernel-shared buf_ring tail directly.
+    // Off-reactor callers must use EnqueueReturnQ instead.
+    // Writes the descriptor for `bid` into the slot selected by the local tail,
+    // advances the tail, and publishes it at ring offset 14.
+    internal void ReturnBufferDirect(ushort bid)
+    {
+        byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16;
+        *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize);
+        *(uint*)(slot + 8) = RecvBufferSize;
+        *(ushort*)(slot + 12) = bid;
+        _bufRingTail++;
+
+        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
+    }
+
+    // =========================================================================
+    // Cross-thread entry points (safe to call from any thread)
+    // =========================================================================
+
+    // Returns buffer `bid` to the shared ring: directly on the reactor thread,
+    // otherwise through the return queue followed by a wake.
+    public void EnqueueReturnQ(ushort bid)
+    {
+        // Fast path: caller is the reactor thread (handler running inline from
+        // an IVTS SetResult). Go straight to the buf_ring — no queue, no syscall.
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            ReturnBufferDirect(bid);
+            return;
+        }
+        // Off-reactor: spin until the bounded queue accepts the id, then wake the reactor.
+        SpinWait sw = default;
+        while (!_returnQ.TryEnqueue(bid))
+        {
+            sw.SpinOnce();
+        }
+        WakeFdWrite();
+    }
+
+    // Requests a send of the connection's pending write for `fd`. On the reactor
+    // thread the send is submitted immediately (if the fd is still mapped);
+    // from any other thread the fd is queued and the reactor is woken.
+    internal void EnqueueFlush(int fd)
+    {
+        // Fast path: caller is the reactor thread; write the SQE directly.
+        if (Environment.CurrentManagedThreadId == _reactorThreadId)
+        {
+            if (Connections.TryGetValue(fd, out var conn))
+            {
+                SubmitSend(fd, conn.WriteBuffer, (uint)conn.WriteInFlight);
+            }
+            return;
+        }
+        SpinWait sw = default;
+        while (!_flushQ.TryEnqueue(fd))
+        {
+            sw.SpinOnce();
+        }
+        WakeFdWrite();
+    }
+
+    // Signals the reactor by writing 1 to the eventfd. The write's return value
+    // is not checked.
+    private void WakeFdWrite()
+    {
+        ulong v = 1;
+        // 8-byte write to eventfd increments its counter; the kernel marks the
+        // fd readable, which fires our registered multishot poll's next CQE.
+        write(_wakeFd, &v, 8);
+    }
+
+    // Returns every queued buffer id to the shared ring.
+    private void DrainReturnQ()
+    {
+        while (_returnQ.TryDequeue(out ushort bid))
+        {
+            ReturnBufferDirect(bid);
+        }
+    }
+
+    // Submits a send for every queued fd that is still in the connection map;
+    // fds that are no longer mapped are skipped.
+    private void DrainFlushQ()
+    {
+        while (_flushQ.TryDequeue(out int fd))
+        {
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                continue;
+            }
+            // Connection state was set by FlushAsync; the Enqueue/Dequeue pair
+            // establishes the happens-before so WriteInFlight is visible here.
+            SubmitSend(fd, conn.WriteBuffer, (uint)conn.WriteInFlight);
+        }
+    }
+
+    // Queues a multishot POLLIN poll on the wake eventfd, tagged KindWake.
+    private void ArmWakePoll()
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_POLL_ADD;
+        sqe->fd = _wakeFd;
+        sqe->op_flags = POLLIN;                  // poll32_events lives at this offset
+        sqe->len = IORING_POLL_ADD_MULTI;        // multishot — stays armed across CQEs
+        sqe->user_data = KindWake | (uint)_wakeFd;
+    }
+
+    // =========================================================================
+    // Main loop
+    // =========================================================================
+
+    // Reactor thread entry point. Records the calling thread as the reactor
+    // thread, creates the ring and listener, initialises buffers for the
+    // configured mode, creates the wake eventfd, arms accept and the wake poll,
+    // then runs the loop. When the loop returns, the listener, eventfd and ring
+    // are released. Throws InvalidOperationException if eventfd creation fails.
+    public void Run()
+    {
+        _reactorThreadId = Environment.CurrentManagedThreadId;
+
+        Ring = Ring.Create(_ringEntries);
+        _listenFd = OpenReusePortListener(_port);
+
+        if (_incremental)
+        {
+            InitIncremental();
+        }
+        else
+        {
+            InitBufferRing();
+        }
+
+        _wakeFd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+        if (_wakeFd < 0)
+        {
+            throw new InvalidOperationException("eventfd failed");
+        }
+
+        Console.WriteLine($"[r{Id}] listening on 0.0.0.0:{_port} (incremental={_incremental})");
+        SubmitAcceptMultishot();
+        ArmWakePoll();
+
+        if (_incremental)
+        {
+            LoopIncremental();
+        }
+        else
+        {
+            LoopShared();
+        }
+
+        // Reached only after the loop breaks on a non-transient error.
+        close(_listenFd);
+        close(_wakeFd);
+        Ring.Dispose();
+    }
+
+    // Shared-ring reactor loop: drain queues, submit and wait, dispatch the batch.
+    private void LoopShared()
+    {
+        while (true)
+        {
+            // Drain MPSC queues from off-reactor handlers. Cheap when empty.
+            DrainReturnQ();
+            DrainFlushQ();
+
+            int rc = Ring.SubmitAndWait(1);
+            // EINTR / EAGAIN / EBUSY are treated as transient; anything else ends the loop.
+            if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY)
+            {
+                Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}");
+                break;
+            }
+
+            // Process the whole ready batch, then advance the CQ head once.
+            uint ready = Ring.CqReady();
+            for (uint i = 0; i < ready; i++)
+            {
+                Dispatch(in Ring.CqeAt(i));
+            }
+            Ring.CqAdvance(ready);
+        }
+    }
+
+    // Routes one completion by the kind tag in the high 32 bits of user_data;
+    // the low 32 bits carry the fd.
+    private void Dispatch(in IoUringCqe cqe)
+    {
+        ulong kind = cqe.user_data & 0xffffffff_00000000UL;
+        int fd = (int)(cqe.user_data & 0xffffffffUL);
+        bool more = (cqe.flags & IORING_CQE_F_MORE) != 0;
+
+        if (kind == KindWake)
+        {
+            // Drain the eventfd counter so the next write re-triggers POLLIN
+            // (multishot poll is edge-triggered on the user_space side).
+            ulong drain;
+            read(_wakeFd, &drain, 8);
+            // The actual queue drains happen at the top of the next loop
+            // iteration — nothing else to do here.
+            if (!more)
+            {
+                ArmWakePoll();
+            }
+            return;
+        }
+
+        if (kind == KindAccept)
+        {
+            if (cqe.res >= 0)
+            {
+                int clientFd = cqe.res;
+                SetNoDelay(clientFd);
+                // Reuse a pooled Connection when one is available, else allocate.
+                Connection conn = _pool.TryPop(out var pooled)
+                    ? pooled.SetFd(clientFd)
+                    : new Connection(this, clientFd, _config.WriteSlabSize);
+                Connections[clientFd] = conn;
+                SubmitRecvMultishot(clientFd);
+
+                // Start the handler; the returned task is intentionally discarded.
+                _ = _config.UsePipe
+                    ? Handler.HandlePipeAsync(this, conn)
+                    : Handler.HandleAsync(this, conn);
+            }
+            else
+            {
+                Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}");
+            }
+            // Multishot accept stays armed; only re-arm if the kernel terminated it.
+            if (!more)
+            {
+                SubmitAcceptMultishot();
+            }
+        }
+        else if (kind == KindRecv)
+        {
+            bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0;
+            // The buffer id sits in the upper bits of flags; 0 when no buffer was attached.
+            ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0;
+
+            if (cqe.res <= 0)
+            {
+                // Peer EOF or recv error — reactor owns teardown.
+                // Give back any buffer attached to the failing CQE before teardown.
+                if (hasBuf)
+                {
+                    ReturnBufferDirect(bid);
+                }
+                if (Connections.Remove(fd, out var dyingConn))
+                {
+                    Recycle(dyingConn, fd);
+                }
+                return;
+            }
+
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                // Straggler buffer for an already-closed connection.
+                if (hasBuf)
+                {
+                    ReturnBufferDirect(bid);
+                }
+                return;
+            }
+
+            // Buffer `bid` starts at bid * RecvBufferSize in the slab; ptr is null
+            // when the CQE carried no buffer.
+            byte* ptr = hasBuf ? _bufSlab + (nuint)bid * (nuint)RecvBufferSize : null;
+            conn.Complete(cqe.res, bid, hasBuf, ptr);
+
+            // Re-arm recv when this CQE did not carry F_MORE.
+            if (!more)
+            {
+                SubmitRecvMultishot(fd);
+            }
+        }
+        else if (kind == KindSend)
+        {
+            if (!Connections.TryGetValue(fd, out var conn))
+            {
+                return;
+            }
+            if (cqe.res <= 0)
+            {
+                // Send error — reactor owns teardown.
+                Connections.Remove(fd);
+                Recycle(conn, fd);
+                return;
+            }
+            conn.WriteHead += cqe.res;
+            if (conn.WriteHead < conn.WriteInFlight)
+            {
+                // Partial send: resubmit the remainder.
+                SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead));
+                return;
+            }
+            // Full target ack'd — resets buffer state and signals the awaiter.
+            conn.CompleteFlush();
+        }
+    }
+
+    // =========================================================================
+    // SQE producers (reactor-thread-only — Connection.FlushAsync hands off via
+    // EnqueueFlush, which DrainFlushQ turns into SubmitSend on this thread)
+    // =========================================================================
+
+    // Returns a free SQE. If the SQ is full, calls Ring.SubmitAndWait(0) once and
+    // retries; throws InvalidOperationException if no SQE is available after that.
+    private IoUringSqe* GetSqeOrFlush()
+    {
+        IoUringSqe* sqe = Ring.GetSqe();
+        if (sqe != null)
+        {
+            return sqe;
+        }
+
+        Ring.SubmitAndWait(0);
+        sqe = Ring.GetSqe();
+
+        if (sqe == null)
+        {
+            throw new InvalidOperationException("SQ full after flush");
+        }
+
+        return sqe;
+    }
+
+    // Queues a multishot accept on the listening socket, tagged KindAccept.
+    private void SubmitAcceptMultishot()
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_ACCEPT;
+        sqe->ioprio = IORING_ACCEPT_MULTISHOT;
+        sqe->fd = _listenFd;
+        sqe->user_data = KindAccept | (uint)_listenFd;
+    }
+
+    // Shared-ring convenience overload: recv against the reactor-wide buffer group.
+    private void SubmitRecvMultishot(int fd) => SubmitRecvMultishot(fd, BgId);
+
+    // Queues a multishot, buffer-selecting recv on `fd` drawing from group `bgid`.
+    private void SubmitRecvMultishot(int fd, ushort bgid)
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_RECV;
+        sqe->flags = IOSQE_BUFFER_SELECT;
+        sqe->ioprio = IORING_RECV_MULTISHOT;
+        sqe->fd = fd;
+        sqe->buf_index = bgid;          // buffer-group id (shared BgId, or per-conn in incremental)
+        sqe->user_data = KindRecv | (uint)fd;
+    }
+
+    // Queues a send of `len` bytes starting at `buf` on `fd`, tagged KindSend.
+    private void SubmitSend(int fd, byte* buf, uint len)
+    {
+        IoUringSqe* sqe = GetSqeOrFlush();
+        Unsafe.InitBlockUnaligned(sqe, 0, 64);
+        sqe->opcode = IORING_OP_SEND;
+        sqe->fd = fd;
+        sqe->addr = (ulong)buf;
+        sqe->len = len;
+        sqe->user_data = KindSend | (uint)fd;
+    }
+
+    // Tears down a connection that has already been removed from the map.
+    private void Recycle(Connection conn, int fd)
+    {
+        // Wake awaiters, drain in-flight buffers, close the fd, reset state,
+        // and either push the Connection back to the pool or free its native
+        // WriteBuffer if the pool is full.
+        conn.MarkClosed();
+        if (_incremental)
+        {
+            // The per-connection ring is freed wholesale; no per-buffer return.
+            // Clear() empties the SPSC ring (leftover slices discarded).
+            TeardownConnectionBufRing(conn);
+        }
+        else
+        {
+            conn.DrainRecv();   // return leftover buffers to the shared ring
+        }
+        close(fd);
+        conn.Clear();
+
+        // Keep the connection for reuse while the pool is below its cap.
+        if (_pool.Count < PoolMax)
+        {
+            _pool.Push(conn);
+        }
+        else
+        {
+            conn.Dispose();
+        }
+    }
+
+    // Disable Nagle on an accepted connection. Must be set per-accepted-socket,
+    // not on the listener — TCP_NODELAY doesn't reliably inherit across accept,
+    // which is why zerg/terraform/rtr all set it on the client fd, not the listener.
+    private static void SetNoDelay(int fd)
+    {
+        int one = 1;
+        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int));
+    }
+
+    // Creates an IPv4 TCP socket with SO_REUSEADDR and SO_REUSEPORT set, binds it
+    // to 0.0.0.0:port and starts listening with a backlog of 128.
+    // Returns the listening fd. Throws InvalidOperationException when socket,
+    // bind or listen fails; on bind/listen failure the socket is closed first so
+    // the descriptor does not leak.
+    private static int OpenReusePortListener(ushort port)
+    {
+        int fd = socket(AF_INET, SOCK_STREAM, 0);
+        if (fd < 0)
+        {
+            throw new InvalidOperationException($"socket failed: {fd}");
+        }
+
+        int one = 1;
+        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
+        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int));
+
+        sockaddr_in addr = default;
+        addr.sin_family = AF_INET;
+        addr.sin_port = Htons(port);
+        addr.sin_addr.s_addr = 0; // 0.0.0.0
+
+        if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0)
+        {
+            close(fd);
+            throw new InvalidOperationException("bind failed");
+        }
+
+        if (listen(fd, 128) < 0)
+        {
+            close(fd);
+            throw new InvalidOperationException("listen failed");
+        }
+
+        return fd;
+    }
+}
diff --git a/Minima/ServerConfig.cs b/Minima/ServerConfig.cs
new file mode 100644
index 0000000..e05b5f5
--- /dev/null
+++ b/Minima/ServerConfig.cs
@@ -0,0 +1,35 @@
+namespace Minima;
+
+/// <summary>
+/// All server tunables in one place — replaces the consts that used to be
+/// scattered across Program.cs and Reactor.cs. Defaults match the previous
+/// hardcoded values; override via object initializer in Main, e.g.:
+///   new ServerConfig { Port = 9000, ReactorCount = 8, Incremental = true }.
+/// </summary>
+public sealed record ServerConfig
+{
+    // Server-level.
+    public ushort Port { get; init; } = 8080;          // port each reactor's listener binds
+    public int ReactorCount { get; init; } = 12;
+
+    // Handler style: false = raw ReadAsync/TryGetItem loop; true = PipeReader/PipeWriter.
+    public bool UsePipe { get; init; } = false;
+
+    // io_uring SQ/CQ depth.
+    public uint RingEntries { get; init; } = 8192;
+
+    // Shared buffer ring (used when Incremental == false).
+    public int RecvBufferSize { get; init; } = 32 * 1024;
+    public int BufferRingEntries { get; init; } = 4096;
+
+    // Per-connection write slab + connection pool cap.
+    public int WriteSlabSize { get; init; } = 16 * 1024;
+    public int PoolMax { get; init; } = 1024;
+
+    // Incremental mode (IOU_PBUF_RING_INC) — per-connection rings.
+    //   reserved native memory ≈ PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount.
+    public bool Incremental { get; init; } = false;
+    public int MaxConnections { get; init; } = 4096;     // GID cap (one bgid per active connection)
+    public int ConnBufRingEntries { get; init; } = 16;   // buffers per connection ring
+    public int IncRecvBufferSize { get; init; } = 4096;  // bytes per buffer (filled incrementally)
+}
diff --git a/Minima/Utils/Mpsc.cs b/Minima/Utils/Mpsc.cs
new file mode 100644
index 0000000..78e445a
--- /dev/null
+++ b/Minima/Utils/Mpsc.cs
@@ -0,0 +1,115 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+// ReSharper disable SuggestVarOrType_BuiltInTypes
+
+namespace Minima.Utils;
+
+///
+/// Bounded lock-free multi-producer / single-consumer queue.
+///
+/// Dmitry Vyukov's bounded MPMC algorithm, specialised to one consumer.
+/// Power-of-two capacity, zero-allocation after construction. Producers claim a
+/// slot via CAS on the enqueue position (a failed TryEnqueue on a full queue
+/// leaves the position untouched — no burned tickets); the single consumer
+/// advances the dequeue position with a plain write. Each slot carries a
+/// sequence number that coordinates ownership between producers and consumer.
+///
+/// One generic queue serves every reactor handoff: Mpsc&lt;ushort&gt; for buffer
+/// returns, Mpsc&lt;int&gt; for flush fds, Mpsc&lt;ulong&gt; for packed incremental
+/// returns. T is unmanaged so each Cell is a blittable value type with no GC refs.
+///
+internal sealed class Mpsc<T> where T : unmanaged
+{
+    // One queue slot: the payload plus the sequence number that says whose turn it is.
+    private struct Cell
+    {
+        public long Sequence;
+        public T Value;
+    }
+
+    private readonly Cell[] _buffer;
+    private readonly int _mask;
+
+    // PaddedLong is a top-level struct (not nested here) because the CLR forbids
+    // explicit layout on a type nested inside a generic.
+    private PaddedLong _enqueuePos;
+    private PaddedLong _dequeuePos;
+
+    /// <summary>
+    /// Creates a queue with <paramref name="capacityPow2"/> slots.
+    /// </summary>
+    /// <exception cref="ArgumentException">Capacity is below 2 or not a power of two.</exception>
+    public Mpsc(int capacityPow2)
+    {
+        if (capacityPow2 < 2 || (capacityPow2 & (capacityPow2 - 1)) != 0)
+            throw new ArgumentException("Capacity must be a power of two >= 2.", nameof(capacityPow2));
+
+        _buffer = new Cell[capacityPow2];
+        _mask = capacityPow2 - 1;
+
+        // Slot i starts with sequence i, i.e. free for the producer holding position i.
+        for (int i = 0; i < capacityPow2; i++)
+            _buffer[i].Sequence = i;
+    }
+
+    /// <summary>Multi-producer safe. Returns false if the queue is full.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryEnqueue(T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        while (true)
+        {
+            long pos = Volatile.Read(ref _enqueuePos.Value);
+            ref Cell cell = ref buffer[(int)pos & mask];
+
+            long seq = Volatile.Read(ref cell.Sequence);
+            long dif = seq - pos;
+
+            if (dif == 0)
+            {
+                // Slot is free for this position: claim it, store, then publish
+                // by bumping the sequence to pos + 1.
+                if (Interlocked.CompareExchange(ref _enqueuePos.Value, pos + 1, pos) == pos)
+                {
+                    cell.Value = item;
+                    Volatile.Write(ref cell.Sequence, pos + 1);
+                    return true;
+                }
+                continue; // lost the race; reload and retry
+            }
+
+            if (dif < 0)
+                return false; // slot not yet consumed → full
+
+            // dif > 0: the position read above is stale; loop and reload it.
+        }
+    }
+
+    /// <summary>Single-consumer only. Returns false if empty.</summary>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool TryDequeue(out T item)
+    {
+        Cell[] buffer = _buffer;
+        int mask = _mask;
+
+        long pos = _dequeuePos.Value;                       // single consumer: plain read
+        ref Cell cell = ref buffer[(int)pos & mask];
+
+        long seq = Volatile.Read(ref cell.Sequence);
+        long dif = seq - (pos + 1);
+
+        // A sequence of pos + 1 means a producer has published this slot.
+        if (dif == 0)
+        {
+            item = cell.Value;
+            _dequeuePos.Value = pos + 1;                    // single consumer: plain write
+            Volatile.Write(ref cell.Sequence, pos + mask + 1); // free slot for producers
+            return true;
+        }
+
+        item = default;
+        return false;
+    }
+}
+
+/// <summary>
+/// A single long padded to a 64-byte cache line so the producer and consumer
+/// positions never share a line (no false sharing). Top-level and non-generic
+/// so it can legally use explicit layout.
+/// </summary>
+[StructLayout(LayoutKind.Explicit, Size = 64)]
+internal struct PaddedLong
+{
+    [FieldOffset(0)] public long Value;
+}
diff --git a/Minima/Utils/RingSegment.cs b/Minima/Utils/RingSegment.cs
new file mode 100644
index 0000000..273d76e
--- /dev/null
+++ b/Minima/Utils/RingSegment.cs
@@ -0,0 +1,31 @@
+using System.Buffers;
+
+namespace Minima.Utils;
+
+///
+/// One segment of a multi-buffer ReadOnlySequence&lt;byte&gt; built by the
+/// ConnectionPipeReader when a single read spans more than one recv buffer.
+/// BufferId is carried for debugging; buffer return is driven off the held
+/// item list, not the segments.
+///
+public sealed class RingSegment : ReadOnlySequenceSegment<byte>
+{
+    // Id of the recv buffer backing this segment.
+    public ushort BufferId { get; }
+
+    // Creates a segment over `memory`; RunningIndex and Next keep their defaults.
+    public RingSegment(ReadOnlyMemory<byte> memory, ushort bufferId)
+    {
+        Memory = memory;
+        BufferId = bufferId;
+    }
+
+    // Links a new segment after this one and returns it. The new segment's
+    // RunningIndex is this segment's RunningIndex plus this segment's length.
+    public RingSegment Append(ReadOnlyMemory<byte> memory, ushort bufferId)
+    {
+        var next = new RingSegment(memory, bufferId)
+        {
+            RunningIndex = RunningIndex + Memory.Length
+        };
+
+        Next = next;
+        return next;
+    }
+}
diff --git a/Minima/SpscRecvRing.cs b/Minima/Utils/SpscRecvRing.cs
similarity index 83%
rename from Minima/SpscRecvRing.cs
rename to Minima/Utils/SpscRecvRing.cs
index 7f398f2..b26642f 100644
--- a/Minima/SpscRecvRing.cs
+++ b/Minima/Utils/SpscRecvRing.cs
@@ -1,9 +1,10 @@
 using System.Runtime.CompilerServices;
+
 // ReSharper disable SuggestVarOrType_BuiltInTypes
 
-namespace Minima;
+namespace Minima.Utils;
 
-internal sealed unsafe class SpscRecvRing
+public sealed unsafe class SpscRecvRing
 {
     public struct Item
     {
@@ -11,6 +12,7 @@ public struct Item
         public ushort Bid;
         public int Len;
         public bool HasBuffer;
+        public ushort Gen; // connection generation when enqueued (incremental return guard)
 
         public ReadOnlySpan<byte> AsSpan() => new(Ptr, Len);
 
@@ -90,4 +92,14 @@ public bool TryDequeueUntil(long tailSnapshot, out Item item)
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public bool IsEmpty() => Volatile.Read(ref _head) >= Volatile.Read(ref _tail);
+
+    // Reactor-thread-only, called during connection teardown (Clear) when no
+    // handler is consuming. Discards any leftover items so the recycled
+    // connection starts empty.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Reset() + { + _head = 0; + _tail = 0; + } } diff --git a/Minima/UnmanagedMemoryManager.cs b/Minima/Utils/UnmanagedMemoryManager.cs similarity index 71% rename from Minima/UnmanagedMemoryManager.cs rename to Minima/Utils/UnmanagedMemoryManager.cs index 75aecec..994202c 100644 --- a/Minima/UnmanagedMemoryManager.cs +++ b/Minima/Utils/UnmanagedMemoryManager.cs @@ -1,13 +1,19 @@ using System.Buffers; -namespace Minima; +namespace Minima.Utils; -internal sealed unsafe class UnmanagedMemoryManager : MemoryManager +public sealed unsafe class UnmanagedMemoryManager : MemoryManager { private readonly byte* _ptr; private readonly int _length; public ushort BufferId { get; } + + public UnmanagedMemoryManager(byte* ptr, int length) + { + _ptr = ptr; + _length = length; + } public UnmanagedMemoryManager(byte* ptr, int length, ushort bufferId) { diff --git a/Minima/Native.cs b/Minima/io_uring/Native.cs similarity index 83% rename from Minima/Native.cs rename to Minima/io_uring/Native.cs index 13ae32c..61d9801 100644 --- a/Minima/Native.cs +++ b/Minima/io_uring/Native.cs @@ -7,11 +7,12 @@ namespace Minima; /// the kernel struct layouts they expect, and the constants needed to /// drive a minimal io_uring loop. 
/// -internal static unsafe class Native { +public static unsafe class Native { private const long SYS_IO_URING_SETUP = 425; private const long SYS_IO_URING_ENTER = 426; private const long SYS_IO_URING_REGISTER = 427; + public const byte IORING_OP_POLL_ADD = 6; public const byte IORING_OP_ACCEPT = 13; public const byte IORING_OP_SEND = 26; public const byte IORING_OP_RECV = 27; @@ -26,7 +27,20 @@ internal static unsafe class Native { public const uint IORING_CQE_F_BUFFER = 1u << 0; public const uint IORING_CQE_F_MORE = 1u << 1; public const int IORING_CQE_BUFFER_SHIFT = 16; - public const uint IORING_REGISTER_PBUF_RING = 22; + public const uint IORING_REGISTER_PBUF_RING = 22; + public const uint IORING_UNREGISTER_PBUF_RING = 23; + public const uint IORING_POLL_ADD_MULTI = 1u << 0; + + // Incremental provided-buffer consumption (kernel 6.12+). IOU_PBUF_RING_INC + // is set in io_uring_buf_reg.flags at registration; IORING_CQE_F_BUF_MORE is + // set on recv CQEs while the kernel will keep appending to the same buffer. + public const ushort IOU_PBUF_RING_INC = 2; + public const uint IORING_CQE_F_BUF_MORE = 1u << 4; + + // eventfd flags + poll mask (used for the cross-thread wake mechanism). + public const int EFD_CLOEXEC = 0x80000; + public const int EFD_NONBLOCK = 0x800; + public const uint POLLIN = 0x0001; // Setup flags. SINGLE_ISSUER tells the kernel only one thread will submit // to this ring (skips locking on the SQ). 
DEFER_TASKRUN defers completion @@ -45,6 +59,8 @@ internal static unsafe class Native { public const int SOL_SOCKET = 1; public const int SO_REUSEADDR = 2; public const int SO_REUSEPORT = 15; + public const int IPPROTO_TCP = 6; + public const int TCP_NODELAY = 1; [DllImport("libc", EntryPoint = "syscall")] private static extern long syscall3(long nr, uint a1, IoUringParams* a2); @@ -71,6 +87,9 @@ public static int io_uring_register(int fd, uint opcode, void* arg, uint nrArgs) [DllImport("libc")] public static extern int bind(int fd, sockaddr_in* addr, uint len); [DllImport("libc")] public static extern int listen(int fd, int backlog); [DllImport("libc")] public static extern int setsockopt(int fd, int level, int optname, void* optval, uint optlen); + [DllImport("libc")] public static extern int eventfd(uint initval, int flags); + [DllImport("libc")] public static extern long write(int fd, void* buf, nuint count); + [DllImport("libc")] public static extern long read(int fd, void* buf, nuint count); public static ushort Htons(ushort x) => (ushort)((x << 8) | (x >> 8)); diff --git a/Minima/Ring.cs b/Minima/io_uring/Ring.cs similarity index 87% rename from Minima/Ring.cs rename to Minima/io_uring/Ring.cs index 748a886..c40040f 100644 --- a/Minima/Ring.cs +++ b/Minima/io_uring/Ring.cs @@ -7,7 +7,7 @@ namespace Minima; -internal sealed unsafe class Ring : IDisposable +public sealed unsafe class Ring : IDisposable { private int _fd; @@ -132,16 +132,31 @@ public bool TryGetCqe(out IoUringCqe cqe) if (head == tail) { cqe = default; + return false; } cqe = _cqes[head & _cqMask]; + return true; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public void CqeSeen() => Volatile.Write(ref *_cqHead, *_cqHead + 1); + // Batched CQ drain (liburing io_uring_for_each_cqe + io_uring_cq_advance): + // read the kernel-written tail once (acquire), process the whole batch, + // then publish the consumed head once (release) instead of once per CQE. 
+
+    // Number of completions available: kernel-written tail minus our head.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead;
+
+    // The i-th pending completion counted from the current head; does not consume it.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask];
+
+    // Marks n completions consumed by publishing head + n.
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n);
+
     public void Dispose()
     {
         if (_ringPtr != null)
diff --git a/Shrike.Playground/Program.cs b/Shrike.Playground/Program.cs
new file mode 100644
index 0000000..20f219c
--- /dev/null
+++ b/Shrike.Playground/Program.cs
@@ -0,0 +1,73 @@
+// ReSharper disable always SuggestVarOrType_BuiltInTypes
+using System.Runtime.CompilerServices;
+using Shrike;
+
+[SkipLocalsInit]
+internal static class Program
+{
+    // Configures the engine through its builder (12 workers, port 8080, the
+    // handler below), then builds and runs it.
+    public static void Main()
+    {
+        var engine = ShrikeEngine
+            .CreateBuilder()
+            .SetNWorkersSolver(() => 12)
+            .SetBacklog(16384)
+            .SetMaxEventsPerWake(512)
+            .SetMaxNumberConnectionsPerWorker(512)
+            .SetPort(8080)
+            .SetSlabSizes(512 * 1024, 128 * 1024)
+            .InjectHandler(HandleAsync);
+
+        engine.Build().Run();
+    }
+
+    ///
+    /// The per-connection handler — Minima-style. The handler owns the request
+    /// lifecycle through the connection's IVTS-backed read/flush:
+    ///     await ReadAsync   → wait for data (suspends until the worker recv's)
+    ///     TryReadRequest    → parse each complete request from the recv window
+    ///     write response    → into the connection's WriteBuffer
+    ///     await FlushAsync  → send (suspends if the socket back-pressures, EPOLLOUT)
+    /// Runs inline on the worker thread, so it's a single-threaded cooperative loop.
+    ///
+    private static async Task HandleAsync(Connection conn)
+    {
+        while (true)
+        {
+            if (await conn.ReadAsync())           // true => peer closed
+                return;
+
+            bool wrote = false;
+            while (conn.TryReadRequest())         // one iteration per complete request
+            {
+                CommitPlainTextResponse(conn);
+                conn.Clear();
+                wrote = true;
+            }
+
+            // Flush once per read, and only if at least one response was written.
+            if (wrote)
+                await conn.FlushAsync();
+        }
+    }
+
+    private static ReadOnlySpan<byte> s_plainTextBody => "Hello, World!"u8;
+
+    // Appends one plain-text HTTP response to the connection's WriteBuffer:
+    // status line and fixed headers, the DateHelper header bytes, then the body.
+    // The Content-Length value is written as two blank bytes and patched in
+    // afterwards, so the body length must stay below 100.
+    private static unsafe void CommitPlainTextResponse(Connection connection)
+    {
+        int tail = connection.WriteBuffer.Tail;
+        int contentLength = s_plainTextBody.Length;
+
+        // "Content-Length: " is followed by two reserved spaces for the digits.
+        connection.WriteBuffer.WriteUnmanaged("HTTP/1.1 200 OK\r\n"u8 +
+                                              "Content-Length:   \r\n"u8 +
+                                              "Server: S\r\n"u8 +
+                                              "Content-Type: text/plain\r\n"u8);
+        connection.WriteBuffer.WriteUnmanaged(DateHelper.HeaderBytes);
+        connection.WriteBuffer.WriteUnmanaged(s_plainTextBody);
+
+        // Patch the 2-digit Content-Length into the reserved spaces (offset matches the header above).
+        // 33 = 17 bytes of status line + 16 bytes of "Content-Length: ".
+        byte* dst = connection.WriteBuffer.Ptr + tail + 33;
+        int tens = contentLength / 10;
+        int ones = contentLength - tens * 10;
+        dst[0] = (byte)('0' + tens);
+        dst[1] = (byte)('0' + ones);
+    }
+}
diff --git a/Shrike.Playground/Shrike.Playground.csproj b/Shrike.Playground/Shrike.Playground.csproj
new file mode 100644
index 0000000..b396691
--- /dev/null
+++ b/Shrike.Playground/Shrike.Playground.csproj
@@ -0,0 +1,17 @@
+
+
+
+        Exe
+        net10.0
+        enable
+        enable
+        true
+        true
+        true
+
+
+
+
+
+
+
diff --git a/Shrike/ABI/Native.cs b/Shrike/ABI/Native.cs
new file mode 100644
index 0000000..92944f9
--- /dev/null
+++ b/Shrike/ABI/Native.cs
@@ -0,0 +1,297 @@
+// ReSharper disable always CheckNamespace
+// ReSharper disable always SuggestVarOrType_BuiltInTypes
+// (var is avoided intentionally in this project so that concrete types are visible at call sites.)
+// ReSharper disable always StackAllocInsideLoop +// ReSharper disable always ClassCannotBeInstantiated +#pragma warning disable CA2014 + +namespace Shrike; + +/// +/// Linux interop surface for a high-performance, epoll-driven TCP server. +/// +/// Design goals: +/// - **Minimal marshaling overhead**: prefer blittable types (e.g., pointers, ints). +/// - **Explicit error handling**: all functions are marked . +/// Use Marshal.GetLastPInvokeError() immediately after a failure to read errno. +/// - **Unsafe-friendly**: exposes pointer overloads for zero-copy recv/send. +/// +/// Platform notes: +/// - Constants can differ across libc/architectures/kernels. The values here target +/// mainstream Linux/glibc on x86_64. If you target other distros/architectures, verify +/// these values against system headers (bits/socket.h, fcntl.h, sys/epoll.h, sys/eventfd.h). +/// - Network byte order: ports must be big-endian (use htons); addresses must be set appropriately. +/// - SIGPIPE: either ignore SIGPIPE process-wide or pass to send. +/// +internal static unsafe class Native +{ + // ========================= + // P/Invoke + // ========================= + + /// + /// Create a socket. Typically domain=AF_INET, type=SOCK_STREAM, protocol=IPPROTO_TCP. + /// Returns a file descriptor (>= 0) on success, or -1 on error (check errno). + /// + [DllImport("libc", SetLastError = true)] internal static extern int socket(int domain, int type, int protocol); + + /// + /// Bind a socket to an address/port. Use for IPv4. + /// Returns 0 on success, -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int bind(int sockfd, ref sockaddr_in addr, uint addrlen); + + /// + /// Mark a bound socket as passive (accept incoming connections). + /// is the kernel queue length hint. + /// Returns 0 on success, -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int listen(int sockfd, int backlog); + + /// + /// Accept a new connection. 
flags can include and + /// to atomically configure the accepted FD. Returns new client FD or -1 on error. + /// Use Marshal.GetLastPInvokeError() to check for / in edge-triggered loops. + /// + [DllImport("libc", SetLastError = true)] internal static extern int accept4(int sockfd, IntPtr addr, IntPtr addrlen, int flags); + + /// + /// Set a socket option (int value). Common options: , TCP_NODELAY, etc. + /// Returns 0 on success, -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int setsockopt(int sockfd, int level, int optname, ref int optval, uint optlen); + + /// + /// Set SO_LINGER using struct. + /// Returns 0 on success, -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int setsockopt(int sockfd, int level, int optname, ref Linger optval, uint optlen); + + /// + /// File control. Typical usage: get/set O_NONBLOCK on a socket. + /// Returns result per command, or -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int fcntl(int fd, int cmd, int arg); + + /// + /// Close a file descriptor (socket or epoll/eventfd). Returns 0 on success, -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int close(int fd); + + /// + /// Read from a file descriptor into unmanaged memory. + /// For sockets, prefer . + /// Returns bytes read (>=0) or -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern long read(int fd, IntPtr buf, ulong count); + + /// + /// Write to a file descriptor from unmanaged memory. + /// For sockets, prefer . + /// Returns bytes written (>=0) or -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern long write(int fd, IntPtr buf, ulong count); + + /// + /// Receive from a socket into unmanaged memory. Returns bytes received (>=0), 0 on orderly shutdown, or -1 on error. + /// Set flags to 0 for normal reads. 
+ /// + [DllImport("libc", SetLastError = true)] internal static extern long recv(int sockfd, IntPtr buf, ulong len, int flags); + + /// + /// Receive from a socket into a raw pointer. Equivalent to the IntPtr overload, but avoids extra pinning overhead when you already have a pointer. + /// + [DllImport("libc", SetLastError = true)] internal static extern long recv(int sockfd, byte* buf, ulong len, int flags); + + /// + /// Send to a socket from unmanaged memory. Returns bytes sent (>=0) or -1 on error. + /// Consider passing in flags to avoid SIGPIPE on closed peers. + /// + [DllImport("libc", SetLastError = true)] internal static extern long send(int sockfd, IntPtr buf, ulong len, int flags); + + /// + /// Send to a socket from a raw pointer (long length). + /// + [DllImport("libc", SetLastError = true)] internal static extern long send(int sockfd, byte* buf, long len, int flags); + + /// + /// Send to a socket from a raw void* and nuint length. + /// This signature maps closely to the native prototype and can reduce marshaling overhead in hot paths. + /// + [DllImport("libc", SetLastError = true)] public static extern nint send(int sockfd, void* buf, nuint len, int flags); + + /// + /// Create an epoll instance. Returns an epoll file descriptor (>=0) or -1 on error. + /// Use to set close-on-exec at creation time. + /// + [DllImport("libc", SetLastError = true)] internal static extern int epoll_create1(int flags); + + /// + /// Control the epoll interest list (add/mod/del). The ev points to an epoll_event struct in unmanaged memory. + /// Returns 0 on success, -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int epoll_ctl(int epfd, int op, int fd, IntPtr ev); + + /// + /// Wait for events. events points to a contiguous array of epoll_event (maxevents elements). + /// Returns number of events (>=0) or -1 on error. Use timeout < 0 to block indefinitely. 
+ /// + [DllImport("libc", SetLastError = true)] internal static extern int epoll_wait(int epfd, IntPtr events, int maxevents, int timeout); + + /// + /// Create an eventfd (userspace semaphore/notification). Great for waking worker threads from another thread. + /// Returns fd (>=0) or -1 on error. + /// + [DllImport("libc", SetLastError = true)] internal static extern int eventfd(uint initval, int flags); + + [DllImport("libc", SetLastError = true)] internal static extern int sched_setaffinity(int pid, IntPtr cpusetsize, ref ulong mask); + + [DllImport("libc", SetLastError = true)] internal static extern int sched_setaffinity(int pid, IntPtr cpusetsize, ref cpu_set_t mask); + + [DllImport("libc")] internal static extern int gettid(); // Linux thread id + + // ========================= + // Struct definitions + // ========================= + + /// + /// IPv4 address (network byte order). + /// + [StructLayout(LayoutKind.Sequential)] + internal struct in_addr + { + /// + /// Address in network byte order (big-endian). 0 == INADDR_ANY. + /// + public uint s_addr; + } + + /// + /// IPv4 socket address. Must be passed with addrlen = (uint)sizeof(sockaddr_in). + /// + [StructLayout(LayoutKind.Sequential)] + internal struct sockaddr_in + { + /// Address family (AF_INET). + public ushort sin_family; + + /// Port in network byte order (use htons). + public ushort sin_port; + + /// IPv4 address (use INADDR_ANY or a specific address in network byte order). + public in_addr sin_addr; + + /// + /// Padding to match native layout (8 bytes). Must be present for correct size. + /// It need not be initialized for normal usage; the kernel ignores it. + /// + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 8)] + public byte[] sin_zero; + } + + /// + /// linger option for SO_LINGER. + /// If l_onoff != 0, close() will block up to l_linger seconds to flush pending data. + /// Be careful: enabling linger can cause unexpected blocking on close. 
+ /// + [StructLayout(LayoutKind.Sequential)] + internal struct Linger + { + public int l_onoff; + public int l_linger; + } + + + // ========================= + // Constants + // ========================= + // Socket families/types/protocols + internal const int AF_INET = 2; + internal const int SOCK_STREAM = 1; + internal const int IPPROTO_TCP = 6; + + // setsockopt levels / names + internal const int SOL_SOCKET = 1; + internal const int SO_REUSEADDR = 2; + internal const int SO_REUSEPORT = 15; + internal const int SO_LINGER = 13; + /// + /// TCP_NODELAY (disable Nagle). Linux defines this at level IPPROTO_TCP with optname=1. + /// (Kept here as constant=1; use level=IPPROTO_TCP when calling setsockopt.) + /// + internal const int TCP_NODELAY = 1; + + // fcntl / file status flags + internal const int O_NONBLOCK = 0x800; // Verify per-arch. + internal const int F_GETFL = 3; + internal const int F_SETFL = 4; + + // epoll events + internal const int EPOLLIN = 0x001; + internal const int EPOLLOUT = 0x004; + internal const int EPOLLERR = 0x008; + internal const int EPOLLHUP = 0x010; + internal const int EPOLLRDHUP = 0x2000; + internal const uint EPOLLET = 0x80000000; + internal const uint EPOLLONESHOT = 0x40000000; + + // epoll_ctl ops + internal const int EPOLL_CTL_ADD = 1; + internal const int EPOLL_CTL_DEL = 2; + internal const int EPOLL_CTL_MOD = 3; + + // CLOEXEC / NONBLOCK flags (creation-time) + /// Close-on-exec for epoll_create1/eventfd. (Verify on your target kernel/arch.) + internal const int EPOLL_CLOEXEC = 0x80000; + + /// + /// On many Linux systems, SOCK_CLOEXEC is 0x1000000 (not 0x80000). + /// Validate this constant on your target platform if you pass it to socket() or accept4(). + /// + internal const int SOCK_CLOEXEC = 0x80000; + + /// Creation-time nonblocking for socket/accept4. 
+ internal const int SOCK_NONBLOCK = 0x800; + + // eventfd flags + internal const int EFD_NONBLOCK = 0x800; + internal const int EFD_CLOEXEC = 0x80000; + + // send/recv flags + /// + /// Suppress SIGPIPE on send. Alternatively, ignore SIGPIPE process-wide. + /// + internal const int MSG_NOSIGNAL = 0x4000; + + // Common errno values we branch on in tight loops + internal const int EINTR = 4; + internal const int EAGAIN = 11; + internal const int EWOULDBLOCK = 11; + internal const int EPIPE = 32; + internal const int ECONNABORTED = 103; + internal const int ECONNRESET = 104; + + public static void PinCurrentThreadToCpu(int cpuIndex) + { + if (cpuIndex < 0 || cpuIndex >= Environment.ProcessorCount) + throw new ArgumentOutOfRangeException(nameof(cpuIndex)); + + unsafe + { + var set = new cpu_set_t(); + int word = cpuIndex / 64; + int bit = cpuIndex % 64; + set.Bits[word] = 1UL << bit; + + int tid = gettid(); + int ret = sched_setaffinity(tid, (IntPtr)sizeof(cpu_set_t), ref set); + if (ret != 0) + throw new InvalidOperationException($"sched_setaffinity failed with errno {Marshal.GetLastPInvokeError()}"); + } + } +} + +internal unsafe struct cpu_set_t +{ + public fixed ulong Bits[16]; // 1024 bits (enough for up to 1024 CPUs) +} \ No newline at end of file diff --git a/Shrike/ABI/ProcessorArchDependant.cs b/Shrike/ABI/ProcessorArchDependant.cs new file mode 100644 index 0000000..c8a712b --- /dev/null +++ b/Shrike/ABI/ProcessorArchDependant.cs @@ -0,0 +1,149 @@ +// ReSharper disable always CheckNamespace +// ReSharper disable always SuggestVarOrType_BuiltInTypes +// (var is avoided intentionally in this project so that concrete types are visible at call sites.) +// ReSharper disable always StackAllocInsideLoop +// ReSharper disable always ClassCannotBeInstantiated +#pragma warning disable CA2014 + +namespace Shrike; + +/// +/// Provides architecture-dependent helpers for low-level socket and epoll interop. 
+/// +/// +/// Linux’s struct epoll_event has different binary layouts depending on CPU architecture +/// (notably 12 bytes on x86/x64 and 16 bytes on most other architectures like ARM/ARM64). +/// This class exposes constants and helpers to correctly read and write those structures +/// at runtime based on the process architecture. +/// +/// +/// +/// These methods are used to serialize and deserialize epoll_event structures directly +/// into unmanaged buffers when interfacing with epoll_wait, epoll_ctl, and related syscalls. +/// +/// +internal static unsafe class ProcessorArchDependant +{ + // ============================================================================================= + // Architecture-dependent configuration + // ============================================================================================= + + /// + /// Indicates whether the current platform uses a packed epoll_event layout (12 bytes). + /// + /// On x86 and x64 (little-endian), the epoll_event structure is packed to 12 bytes. + /// On ARM, ARM64, and others, it uses natural 8-byte alignment, resulting in 16 bytes. + /// + /// + internal static readonly bool Packed = + RuntimeInformation.ProcessArchitecture == Architecture.X64 || + RuntimeInformation.ProcessArchitecture == Architecture.X86; + + /// + /// The size (in bytes) of an epoll_event structure for the current runtime architecture. + /// + /// Typically 12 bytes for packed x86/x64 layouts and 16 for natural alignment layouts. + /// + /// + internal static readonly int EvSize = Packed ? 12 : 16; + + // ============================================================================================= + // Struct read/write helpers + // ============================================================================================= + + /// + /// Writes a Linux epoll_event structure into a preallocated unmanaged memory region. + /// + /// Destination pointer to write the structure into. + /// Bitmask of epoll events (e.g. 
EPOLLIN, EPOLLOUT, EPOLLRDHUP, etc.). + /// The file descriptor associated with the event. + /// + /// + /// Layouts by architecture: + /// + /// Packed (x86/x64): events @ 0 (4 bytes), data @ 4 (8 bytes) + /// Natural (ARM/others): events @ 0 (4 bytes), padding 4, data @ 8 (8 bytes) + /// + /// + /// Only the lower 32 bits of are stored in the data field. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void WriteEpollEvent(void* dest, uint events, int fd) + { + if (Packed) + { + // events @0 (4 bytes), data @4 (8 bytes) + *(uint*)dest = events; + *(ulong*)((byte*)dest + 4) = (uint)fd; // store fd in low 32 bits + } + else + { + // events @0 (4 bytes), pad 4, data @8 (8 bytes) + *(uint*)dest = events; + *(ulong*)((byte*)dest + 8) = (uint)fd; + } + } + + /// + /// Reads a Linux epoll_event structure from unmanaged memory and extracts its fields. + /// + /// Pointer to the source buffer containing the epoll_event structure. + /// Outputs the event flags (EPOLLIN, EPOLLOUT, etc.). + /// Outputs the associated file descriptor. + /// + /// Reads using the correct layout depending on the flag. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ReadEpollEvent(void* src, out uint events, out int fd) + { + if (Packed) + { + events = *(uint*)src; + fd = (int)*(uint*)((byte*)src + 4); + } + else + { + events = *(uint*)src; + fd = (int)*(uint*)((byte*)src + 8); + } + } + + // Variations, TODO: Test performance required + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] + internal static void WriteEpollEvent2(void* dest, uint events, int fd) + { + // Write events (always aligned 4B store) + *(uint*)dest = events; + // Compute data offset (packed: +4, natural: +8) + var data = (byte*)dest + (Packed ? 4 : 8); + // Store only low 32 bits of fd and zero the high 32 bits. + // Using two 4B stores avoids an unaligned 8B write in the packed layout. 
+ *(uint*)data = (uint)fd; // low 32 + *(uint*)(data + 4) = 0; // high 32 + } + + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] + internal static void ReadEpollEvent2(void* src, out uint events, out int fd) + { + events = *(uint*)src; + var data = (byte*)src + (Packed ? 4 : 8); + // We only ever wrote the low 32 bits; read exactly those. + fd = (int)*(uint*)data; + } + + // ============================================================================================= + // Networking helpers + // ============================================================================================= + + /// + /// Converts a 16-bit unsigned integer from host byte order to network byte order (big-endian). + /// + /// The value to convert. + /// The converted value in network byte order. + /// + /// Equivalent to the native htons() function from the BSD sockets API. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ushort Htons(ushort x) => + BitConverter.IsLittleEndian ? BinaryPrimitives.ReverseEndianness(x) : x; +} \ No newline at end of file diff --git a/Shrike/Engine/Connection.cs b/Shrike/Engine/Connection.cs new file mode 100644 index 0000000..7dfd86e --- /dev/null +++ b/Shrike/Engine/Connection.cs @@ -0,0 +1,245 @@ +using System.Threading.Tasks.Sources; + +// ReSharper disable always CheckNamespace +// ReSharper disable always SuggestVarOrType_BuiltInTypes +#pragma warning disable CA2014 + +namespace Shrike; + +/// +/// Per-connection state with Minima-style IVTS on the read and flush paths. +/// +/// The epoll worker is the DRIVER: on EPOLLIN it drains recv into +/// then calls ; on EPOLLOUT +/// it continues and, when drained, calls . 
+/// The per-connection handler loop awaits / ; +/// because RunContinuationsAsynchronously = false, those continuations run inline +/// on the worker thread — the handler and the driver are the same thread (cooperative, +/// single-threaded), exactly like Minima's reactor. +/// +[SkipLocalsInit] +public sealed unsafe class Connection : IValueTaskSource, IValueTaskSource, IDisposable +{ + public enum FlushResult { Complete, Incomplete, Close } + + // ---- recv window: valid bytes in [Head .. Tail) ---- + public int Head, Tail; + public readonly byte* ReceiveBuffer; + private readonly int _inSlabSize; + + // ---- send buffer ---- + public readonly FixedBufferWriter WriteBuffer; + + // ---- per-request parsed header (no allocations) ---- + public BinaryH1HeaderData BinaryH1HeaderData { get; set; } + public H1HeaderData H1HeaderData { get; set; } = null!; + + // ---- epoll wiring (set when the connection is bound to a live fd) ---- + public int Fd; + public int Ep; + + // ---- read IVTS (result = isClosed) ---- + private ManualResetValueTaskSourceCore _readSignal = new() { RunContinuationsAsynchronously = false }; + private int _armed; + private int _pending; + private int _closed; + + // ---- flush IVTS ---- + private ManualResetValueTaskSourceCore _flushSignal = new() { RunContinuationsAsynchronously = false }; + private int _flushArmed; + + public Connection(int maxConnections, int inSlabSize, int outSlabSize) + { + _inSlabSize = inSlabSize; + ReceiveBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)inSlabSize, 64); + WriteBuffer = new FixedBufferWriter((byte*)NativeMemory.AlignedAlloc((nuint)outSlabSize, 64), outSlabSize); + } + + /// + /// Parse the next complete HTTP request from the recv window into + /// and advance past it. Returns false when no + /// more complete requests remain, compacting any trailing partial to the front + /// so the next recv appends after it. Call in a loop after . 
+ /// + public bool TryReadRequest() + { + int idx = 0; + ReadOnlySpan headerSpan = FindCrlfCrlf(ReceiveBuffer, Head, Tail, ref idx); + if (idx < 0) + { + Compact(); + return false; + } + BinaryH1HeaderData = ExtractBinaryH1HeaderData(headerSpan); + Head = idx + 4; // advance past CRLFCRLF + return true; + } + + private void Compact() + { + if (Head > 0 && Head < Tail) + { + int length = Tail - Head; + Buffer.MemoryCopy(ReceiveBuffer + Head, ReceiveBuffer, _inSlabSize, length); + Head = 0; + Tail = length; + } + else + { + Head = Tail = 0; + } + } + + /// Bind to a fresh fd taken from the pool and reset all per-connection state. + public void Reset(int fd, int ep) + { + Fd = fd; + Ep = ep; + Head = Tail = 0; + WriteBuffer.Reset(); + Volatile.Write(ref _armed, 0); + Volatile.Write(ref _pending, 0); + Volatile.Write(ref _closed, 0); + Volatile.Write(ref _flushArmed, 0); + _readSignal.Reset(); + _flushSignal.Reset(); + } + + public void Clear() => H1HeaderData?.Clear(); + + public bool IsClosed => Volatile.Read(ref _closed) != 0; + + // ============================ READ ============================ + + public ValueTask ReadAsync() + { + if (Volatile.Read(ref _pending) == 1) + { + Volatile.Write(ref _pending, 0); + return new ValueTask(Volatile.Read(ref _closed) != 0); + } + if (Volatile.Read(ref _closed) != 0) + return new ValueTask(true); + + _readSignal.Reset(); + Volatile.Write(ref _armed, 1); + + // Lost-wakeup guard: data/close may have raced in just before we armed. + if (Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0) + { + Volatile.Write(ref _pending, 0); + Volatile.Write(ref _armed, 0); + return new ValueTask(Volatile.Read(ref _closed) != 0); + } + return new ValueTask(this, _readSignal.Version); + } + + /// Worker thread: recv data is in the buffer — wake the handler's ReadAsync. 
+ public void SignalReadable() + { + if (Interlocked.Exchange(ref _armed, 0) == 1) + _readSignal.SetResult(Volatile.Read(ref _closed) != 0); + else + Volatile.Write(ref _pending, 1); + } + + // ============================ FLUSH ============================ + + public ValueTask FlushAsync() + { + FlushResult r = TryFlush(); + if (r == FlushResult.Complete) + return ValueTask.CompletedTask; + if (r == FlushResult.Close) + { + MarkClosed(); + return ValueTask.CompletedTask; // Serve observes IsClosed and exits + } + + // Partial / EAGAIN — wait for the worker to drain EPOLLOUT. + _flushSignal.Reset(); + Volatile.Write(ref _flushArmed, 1); + ArmEpollOut(); + return new ValueTask(this, _flushSignal.Version); + } + + /// Non-blocking send of everything staged in . + public FlushResult TryFlush() + { + while (true) + { + long remaining = WriteBuffer.Tail - WriteBuffer.Head; + if (remaining == 0) { WriteBuffer.Reset(); return FlushResult.Complete; } + + byte* head = WriteBuffer.Ptr + WriteBuffer.Head; + long n = send(Fd, head, remaining, MSG_NOSIGNAL); + if (n > 0) + { + if (n == remaining) { WriteBuffer.Reset(); return FlushResult.Complete; } + WriteBuffer.Head += (int)n; + continue; + } + + int err = (n == 0) ? EAGAIN : Marshal.GetLastPInvokeError(); + if (err is EAGAIN or EWOULDBLOCK) return FlushResult.Incomplete; + return FlushResult.Close; + } + } + + /// Worker thread: EPOLLOUT fully drained — wake the handler's FlushAsync. 
+ public void CompleteFlush() + { + if (Interlocked.Exchange(ref _flushArmed, 0) == 1) + _flushSignal.SetResult(true); + } + + // ============================ CLOSE ============================ + + public void MarkClosed() + { + Volatile.Write(ref _closed, 1); + if (Interlocked.Exchange(ref _armed, 0) == 1) + _readSignal.SetResult(true); + else + Volatile.Write(ref _pending, 1); + + if (Interlocked.Exchange(ref _flushArmed, 0) == 1) + _flushSignal.SetResult(true); + } + + // ========================= epoll arming ========================= + + private void ArmEpollOut() + { + byte* ev = stackalloc byte[EvSize]; + WriteEpollEvent(ev, EPOLLOUT | EPOLLRDHUP | EPOLLERR | EPOLLHUP | EPOLLET, Fd); + epoll_ctl(Ep, EPOLL_CTL_MOD, Fd, (IntPtr)ev); + } + + public void ArmEpollIn() + { + byte* ev = stackalloc byte[EvSize]; + WriteEpollEvent(ev, EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP | EPOLLET, Fd); + epoll_ctl(Ep, EPOLL_CTL_MOD, Fd, (IntPtr)ev); + } + + // ===================== IValueTaskSource (read) ===================== + bool IValueTaskSource.GetResult(short token) => _readSignal.GetResult(token); + ValueTaskSourceStatus IValueTaskSource.GetStatus(short token) => _readSignal.GetStatus(token); + void IValueTaskSource.OnCompleted(Action continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags) + => _readSignal.OnCompleted(continuation, state, token, flags); + + // ======================= IValueTaskSource (flush) ======================= + void IValueTaskSource.GetResult(short token) => _flushSignal.GetResult(token); + ValueTaskSourceStatus IValueTaskSource.GetStatus(short token) => _flushSignal.GetStatus(token); + void IValueTaskSource.OnCompleted(Action continuation, object? 
state, short token, ValueTaskSourceOnCompletedFlags flags) + => _flushSignal.OnCompleted(continuation, state, token, flags); + + public void Dispose() + { + if (ReceiveBuffer != null) + NativeMemory.AlignedFree(ReceiveBuffer); + WriteBuffer.Dispose(); + GC.SuppressFinalize(this); + } +} diff --git a/Shrike/Engine/ShrikeEngine.Builder.cs b/Shrike/Engine/ShrikeEngine.Builder.cs new file mode 100644 index 0000000..b2d997c --- /dev/null +++ b/Shrike/Engine/ShrikeEngine.Builder.cs @@ -0,0 +1,174 @@ +// ReSharper disable always CheckNamespace +// ReSharper disable always SuggestVarOrType_BuiltInTypes +// (var is avoided intentionally in this project so that concrete types are visible at call sites.) +// ReSharper disable always StackAllocInsideLoop +// ReSharper disable always ClassCannotBeInstantiated +#pragma warning disable CA2014 + +namespace Shrike; + +public sealed partial class ShrikeEngine +{ + /// + /// Factory for configuring and constructing an . + /// Usage: + /// + /// var engine = ShrikeEngine.CreateBuilder() + /// .SetPort(8080) + /// .SetBacklog(16384) + /// .SetMaxNumberConnectionsPerWorker(512) + /// .SetSlabSizes(16 * 1024, 16 * 1024) + /// .SetMaxEventsPerWake(512) + /// .SetMaxStackSizePerThread(1024 * 1024) + /// .SetNWorkersSolver(() => Environment.ProcessorCount / 2) + /// .InjectRequestHandler(conn => { /* write response */ }) + /// .Build(); + /// + /// + public static ShrikeBuilder CreateBuilder() => new ShrikeBuilder(); + + // ===== Engine-wide configuration (static for now; set via builder) ===== + private static int _port = 8080; + private static int _backlog = 16384; + private static int _maxNumberConnectionsPerWorker = 512; + private static int _inSlabSize = 16 * 1024; + private static int _outSlabSize = 16 * 1024; + private static int _maxEventsPerWake = 512; + private static int _maxStackSizePerThread = 1024 * 1024; + private static int _nWorkers; + + private static Func? 
_calculateNumberWorkers; + + // Default per-connection handler loop (overridden via builder). Demonstrates the + // Minima-style model: await ReadAsync, parse + write, await FlushAsync. + private static Func _sHandler = DefaultHandler; + + private static async Task DefaultHandler(Connection conn) + { + while (true) + { + if (await conn.ReadAsync()) return; // wait for data; true => peer closed + bool wrote = false; + while (conn.TryReadRequest()) + { + conn.WriteBuffer.WriteUnmanaged("HTTP/1.1 200 OK\r\n"u8 + + "Server: S\r\n"u8 + + "Content-Type: text/plain\r\n"u8 + + "Content-Length: 28\r\n\r\n"u8 + + "Request handler was not set!"u8); + conn.Clear(); + wrote = true; + } + if (wrote) await conn.FlushAsync(); + } + } + + private ShrikeEngine() { } + + /// + /// Fluent builder for . This configures static fields, + /// then finalizes OS resources (listen socket) and decides worker count. + /// + public sealed class ShrikeBuilder + { + private readonly ShrikeEngine _engine; + public ShrikeBuilder() => _engine = new ShrikeEngine(); + + /// Set the TCP listening port (default: 8080). + public ShrikeBuilder SetPort(int port) + { + _port = port; + return this; + } + + /// + /// Set the listen backlog (default: 16384). This is a hint to the kernel for + /// the maximum pending connection queue length. + /// + public ShrikeBuilder SetBacklog(int backlog) + { + _backlog = backlog; + return this; + } + + /// + /// Max concurrent connections each worker is allowed to track (default: 512). + /// Used for per-worker capacity planning and slab sizing. + /// + public ShrikeBuilder SetMaxNumberConnectionsPerWorker(int maxNumberConnectionsPerWorker) + { + _maxNumberConnectionsPerWorker = maxNumberConnectionsPerWorker; + return this; + } + + /// + /// Configure input/output slab sizes used by connections (defaults: 16 KiB each). 
+ /// + public ShrikeBuilder SetSlabSizes(int inSlabSize, int outSlabSize) + { + _inSlabSize = inSlabSize; + _outSlabSize = outSlabSize; + return this; + } + + /// + /// Operational batch size for epoll wait loops—how many events to pull per wake (default: 512). + /// Increase if workers often see saturated event bursts. + /// + public ShrikeBuilder SetMaxEventsPerWake(int maxEventsPerWake) + { + _maxEventsPerWake = maxEventsPerWake; + return this; + } + + /// + /// Max stack size per worker thread (default: 1 MiB). Useful when using large stackallocs + /// in hot loops; keep conservative to avoid stack overflows under deep recursion or bursts. + /// + public ShrikeBuilder SetMaxStackSizePerThread(int maxStackSizePerThread) + { + _maxStackSizePerThread = maxStackSizePerThread; + return this; + } + + /// + /// Provide a delegate that decides the worker count at build time. + /// If not provided, defaults to Environment.ProcessorCount / 2. + /// + public ShrikeBuilder SetNWorkersSolver(Func? solver) + { + _calculateNumberWorkers = solver; + return this; + } + + /// + /// Inject the per-connection handler loop. It owns the request lifecycle: + /// await conn.ReadAsync()while (conn.TryReadRequest()) { write }await conn.FlushAsync(). + /// + public ShrikeBuilder InjectHandler(Func handler) + { + _sHandler = handler; + return this; + } + + /// + /// Finalize configuration: + /// - Logs arch and epoll_event packing + /// - Creates and configures the listening socket + /// - Resolves worker count via solver or default heuristic + /// + public ShrikeEngine Build() + { + Console.WriteLine( + $"Arch={RuntimeInformation.ProcessArchitecture}, Packed={(Packed ? 12 : 16)}-byte epoll_event"); + + // Decide worker count (solver wins if supplied). + _nWorkers = _calculateNumberWorkers is null + ? 
Environment.ProcessorCount / 2 + : _calculateNumberWorkers(); + + return _engine; + } + } + +} \ No newline at end of file diff --git a/Shrike/Engine/ShrikeEngine.Runner.cs b/Shrike/Engine/ShrikeEngine.Runner.cs new file mode 100644 index 0000000..ce0c477 --- /dev/null +++ b/Shrike/Engine/ShrikeEngine.Runner.cs @@ -0,0 +1,37 @@ +// ReSharper disable always CheckNamespace +// ReSharper disable always SuggestVarOrType_BuiltInTypes +#pragma warning disable CA2014 + +namespace Shrike; + +public sealed partial class ShrikeEngine +{ + /// + /// Boots the engine. Each worker owns its own epoll instance and its own + /// SO_REUSEPORT listen socket — the kernel load-balances accepts across them, + /// so there is no acceptor thread (this mirrors Minima's per-reactor model). + /// Worker 0 runs on the calling thread (blocks for the process lifetime); the + /// rest run on background threads. + /// + public void Run() + { + var workers = new Worker[_nWorkers]; + for (int i = 0; i < _nWorkers; i++) + workers[i] = new Worker(i, _maxEventsPerWake, _port, _backlog); + + Console.WriteLine($"Shrike listening on 0.0.0.0:{_port} with {_nWorkers} worker(s) (SO_REUSEPORT, no acceptor)"); + + for (int i = 1; i < _nWorkers; i++) + { + int iCap = i; + var t = new Thread(() => WorkerLoop(workers[iCap]), _maxStackSizePerThread) + { + IsBackground = true, + Name = $"worker-{iCap}" + }; + t.Start(); + } + + WorkerLoop(workers[0]); // run one worker on the calling thread; blocks + } +} diff --git a/Shrike/Engine/ShrikeEngine.Worker.cs b/Shrike/Engine/ShrikeEngine.Worker.cs new file mode 100644 index 0000000..8f6eda3 --- /dev/null +++ b/Shrike/Engine/ShrikeEngine.Worker.cs @@ -0,0 +1,157 @@ +// ReSharper disable always CheckNamespace +// ReSharper disable always SuggestVarOrType_BuiltInTypes +// ReSharper disable always StackAllocInsideLoop +#pragma warning disable CA2014 + +namespace Shrike; + +public sealed partial class ShrikeEngine +{ + // Pooled Connection instances (reused across fds to 
/// <summary>
/// Worker event loop: waits on the worker's epoll instance and drives each connection's handler.
/// </summary>
/// <remarks>
/// The read/parse/write/flush logic lives in the injected handler (started via RunHandler);
/// this loop only reacts to readiness events:
///   - listen socket readable  → drain accept4, register the client fd, start its handler
///   - EPOLLERR/HUP/RDHUP      → MarkClosed, then recycle the connection
///   - EPOLLIN                 → RecvDrain, then SignalReadable
///   - EPOLLOUT                → TryFlush; on completion re-arm EPOLLIN and CompleteFlush
/// Per the original design notes, continuations run inline on this thread
/// (RunContinuationsAsynchronously=false), so handler and driver share the worker thread.
/// This method never returns normally; it exits only by throwing when epoll_wait fails
/// with anything other than EINTR.
/// </remarks>
/// <param name="W">The worker whose epoll fd, listen fd and event buffer are serviced.</param>
private static unsafe void WorkerLoop(Worker W)
{
    var connections = new Dictionary<int, Connection>(capacity: _maxNumberConnectionsPerWorker);

    // Scratch epoll_event used to register accepted sockets. It is allocated ONCE, outside the
    // loops: stackalloc memory is released only when the method returns, and this method never
    // returns, so allocating per accepted connection grew the stack until it overflowed.
    byte* ev = stackalloc byte[EvSize];

    for (;;)
    {
        int n = epoll_wait(W.Ep, W.EventsBuf, W.MaxEvents, -1);
        if (n < 0)
        {
            if (Marshal.GetLastPInvokeError() == EINTR) continue;
            throw new Exception("epoll_wait worker");
        }

        for (int i = 0; i < n; i++)
        {
            ReadEpollEvent((byte*)W.EventsBuf + i * EvSize, out uint evs, out int fd);

            // 1) Our own SO_REUSEPORT listener is readable — drain accepts.
            if (fd == W.ListenFd)
            {
                for (;;)
                {
                    int cfd = accept4(W.ListenFd, IntPtr.Zero, IntPtr.Zero, SOCK_NONBLOCK | SOCK_CLOEXEC);
                    if (cfd >= 0)
                    {
                        int one = 1;
                        setsockopt(cfd, IPPROTO_TCP, TCP_NODELAY, ref one, sizeof(int));

                        WriteEpollEvent(ev, EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP | EPOLLET, cfd);
                        if (epoll_ctl(W.Ep, EPOLL_CTL_ADD, cfd, (IntPtr)ev) != 0)
                        {
                            // An unregistered fd would never produce events: drop it now rather
                            // than leak the socket and a pooled Connection.
                            CloseQuiet(cfd);
                            continue;
                        }

                        Connection c = ConnectionPool.Get();
                        c.Reset(cfd, W.Ep);
                        connections[cfd] = c;
                        _ = RunHandler(c); // suspends immediately on its first ReadAsync
                        continue;
                    }

                    int err = Marshal.GetLastPInvokeError();
                    if (err == EINTR) continue;
                    break; // EAGAIN/EWOULDBLOCK (drained) or transient error
                }
                continue;
            }

            // Event for an fd this worker does not track: close it.
            if (!connections.TryGetValue(fd, out var conn))
            {
                CloseQuiet(fd);
                continue;
            }

            // 2) Error / hangup / remote half-close.
            if ((evs & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) != 0)
            {
                conn.MarkClosed(); // resumes & exits the handler inline
                CloseConn(fd, connections);
                continue;
            }

            // 3) Read-ready.
            if ((evs & EPOLLIN) != 0)
            {
                if (!RecvDrain(conn, fd))
                {
                    conn.MarkClosed();
                    CloseConn(fd, connections);
                    continue;
                }

                conn.SignalReadable(); // resume handler inline: parse + write + flush
                if (conn.IsClosed) CloseConn(fd, connections);
                continue;
            }

            // 4) Write-ready (a previous flush was partial).
            if ((evs & EPOLLOUT) != 0)
            {
                Connection.FlushResult r = conn.TryFlush();
                if (r == Connection.FlushResult.Complete)
                {
                    conn.ArmEpollIn();
                    conn.CompleteFlush(); // resume handler inline: loops back to ReadAsync
                    if (conn.IsClosed) CloseConn(fd, connections);
                }
                else if (r == Connection.FlushResult.Close)
                {
                    conn.MarkClosed();
                    CloseConn(fd, connections);
                }
                // Incomplete: stay armed for EPOLLOUT.
            }
        }
    }
}
/// <summary>
/// Reads from <paramref name="fd"/> into the connection's receive buffer until the socket has
/// no more data (edge-triggered epoll requires reading until EAGAIN).
/// </summary>
/// <param name="conn">Connection whose ReceiveBuffer/Tail receive the bytes.</param>
/// <param name="fd">Socket to read from.</param>
/// <returns>
/// <c>true</c> when the socket was drained or the receive buffer is full;
/// <c>false</c> when the peer closed the connection or recv failed with a hard error.
/// </returns>
private static unsafe bool RecvDrain(Connection conn, int fd)
{
    for (;;)
    {
        int room = _inSlabSize - conn.Tail;
        if (room == 0)
        {
            // Buffer full — the handler has to consume it before more can be read.
            return true;
        }

        long received = recv(fd, conn.ReceiveBuffer + conn.Tail, (ulong)room, 0);

        if (received == 0)
        {
            // Orderly shutdown by the peer.
            return false;
        }

        if (received > 0)
        {
            conn.Tail += (int)received;
            continue;
        }

        int errno = Marshal.GetLastPInvokeError();
        if (errno == EINTR)
        {
            // Interrupted by a signal: retry the read.
            continue;
        }

        // EAGAIN/EWOULDBLOCK mean "drained"; anything else (ECONNRESET, EPIPE, ...) is fatal.
        return errno == EAGAIN || errno == EWOULDBLOCK;
    }
}
+// ReSharper disable always StackAllocInsideLoop +// ReSharper disable always ClassCannotBeInstantiated +#pragma warning disable CA2014 + + +namespace Shrike; + +/// +/// ShrikeEngine — a minimal, high-performance HTTP engine core built around +/// Linux epoll and eventfd, written in C# with unsafe paths and pooling to +/// minimize allocations and syscalls. This class owns process-wide configuration, socket +/// initialization, worker creation, the accept loop, and the request/response I/O pipeline. +/// +/// +/// Design goals +/// +/// Throughput first: epoll-driven, non-blocking I/O; batched accept and send. +/// Predictable latency: avoid cross-thread contention; per-worker ownership of fds. +/// Low GC pressure: slabbed receive/write buffers; pooled Connection instances. +/// Simple mental model: one acceptor + N workers; explicit state transitions. +/// +/// +/// High-level architecture +/// +/// +/// +/// Acceptor thread (runs on the caller thread): +/// +/// Owns the listening socket (_listenFd), set to O_NONBLOCK. +/// Has a dedicated epoll fd and waits for EPOLLIN on the listen socket. +/// On readiness, drains accept4() in a loop until EAGAIN. +/// For each accepted cfd, selects the least-busy worker (by Worker.Current), enqueues the fd into the worker’s inbox, and signals the worker via eventfd. +/// +/// +/// +/// +/// +/// Workers (_nWorkers threads): +/// +/// Each worker owns an epoll instance, an eventfd (NotifyEfd), and an fd→Connection map. +/// On eventfd wakeup, they dequeue new client fds and register them for EPOLLIN|EPOLLRDHUP|EPOLLERR|EPOLLHUP. +/// EPOLLIN: recv into the per-connection receive slab; parse complete requests; write responses into the write slab; try to flush immediately. +/// EPOLLOUT: continue flushing partial responses; on drain, re-arm EPOLLIN. +/// On error/hangup, the worker closes the fd and returns the Connection to the pool. 
+/// +/// +/// +/// +/// +/// Core data structures +/// +/// Connection: pooled object containing receive/write slabs (unsafe pointers), head/tail indices, and lightweight per-request state (e.g., hashed route). +/// ObjectPool<Connection>: reduces allocation churn under load; return path can reset buffers. +/// Header parsing helpers: naive but vectorized IndexOf(CRLF/CRLFCRLF), and a route hash via Fnv1a32. +/// +/// +/// I/O flow (hot path) +/// +/// recv → accumulate bytes in ReceiveBuffer [Head..Tail). +/// Find \r\n\r\n; on full header → extract target → _sRequestHandler(connection) writes response into WriteBuffer. +/// send(MSG_NOSIGNAL) until EAGAIN or fully flushed; if partial → arm EPOLLOUT; if drained → arm EPOLLIN. +/// +/// +/// Threading & synchronization +/// +/// Acceptor is single-threaded; workers are OS threads with configurable stack size (defaults 1 MiB). +/// Work distribution via lock-free queue (worker inbox) + eventfd wakeup (8-byte increments). +/// Worker.Current is adjusted with Interlocked; least-busy selection uses Volatile.Read inside a simple O(N) scan. +/// +/// +/// Configuration knobs (set via ShrikeBuilder) +/// +/// _port, _backlog: listener endpoint and queue size. +/// _maxNumberConnectionsPerWorker: map capacity & planning per worker. +/// _inSlabSize/_outSlabSize: receive/write slab sizes (defaults 16 KiB). +/// _maxEventsPerWake: epoll batch size per wait. +/// _maxStackSizePerThread: worker thread stack (useful for stackalloc heavy code paths). +/// _calculateNumberWorkers: custom worker count heuristic; defaults to ProcessorCount / 2. +/// _sRequestHandler: application callback that writes to WriteBuffer. +/// +/// +/// Socket & epoll setup +/// +/// Listener: socket(AF_INET, SOCK_STREAM|CLOEXEC, IPPROTO_TCP), SO_REUSEADDR, O_NONBLOCK, bind(0.0.0.0:port), listen(backlog). +/// Acceptor epoll: monitor EPOLLIN|EPOLLERR|EPOLLHUP on the listen fd. 
+/// Accepted sockets: TCP_NODELAY, SO_LINGER off, O_NONBLOCK (via accept4 flags). +/// +/// +/// Error handling +/// +/// EINTR/EAGAIN/EWOULDBLOCK are treated as transient. +/// ECONNRESET/ECONNABORTED/EPIPE close the connection. +/// Unexpected errors during recv/send/ctl cause a quiet close; the worker load counter is decremented. +/// +/// +/// Performance notes +/// +/// Batching: accept loop drains in one epoll tick; send loop attempts to drain buffer before arming EPOLLOUT. +/// Pooling: Connection reuse avoids per-request allocations; slabs are fixed-size for cache locality. +/// Vectorized searches: Span<>.IndexOf on constants like CRLFCRLF leverages SIMD on modern runtimes. +/// +/// +/// Safety & invariants +/// +/// All fds registered in a worker’s epoll must be serviced only by that worker. +/// For ReadOnlySpan over byte*, callers guarantee the [Head..Tail) window is valid and pinned for the call duration. +/// WriteBuffer and ReceiveBuffer head/tail invariants are maintained by Connection methods and parsing routines. +/// +/// +/// Extensibility +/// +/// Custom routing/dispatch: swap _sRequestHandler; keep response writes contiguous for fewer syscalls. +/// Back-pressure: introduce per-fd send windowing or global egress caps if needed. +/// HTTP parsing: current header scan is naive; can be upgraded with SIMD or a finite-state parser. +/// +/// +/// Limitations / TODO +/// +/// No dynamic slab growth yet; large headers/bodies can overflow the configured receive slab. +/// Request pipelining supported at header-level; body handling and chunked decoding not implemented here. +/// Minimal logging/telemetry; production builds should integrate structured, sampling-friendly logs. 
/// <summary>
/// One worker = one epoll instance plus its OWN SO_REUSEPORT listen socket and an unmanaged
/// buffer for epoll_wait results. Each worker accepts and serves its own connections, so no
/// acceptor thread or fd handoff is involved.
/// </summary>
[SkipLocalsInit]
internal sealed unsafe class Worker : IDisposable
{
    /// <summary>Zero-based worker index, as passed to the constructor.</summary>
    internal readonly int Index;

    /// <summary>This worker's own epoll instance (client fds + the listen socket).</summary>
    internal readonly int Ep;

    /// <summary>This worker's own SO_REUSEPORT listening socket; -1 if it was never opened.</summary>
    internal readonly int ListenFd;

    /// <summary>Unmanaged buffer for epoll_wait() results (EvSize * MaxEvents bytes).</summary>
    internal readonly IntPtr EventsBuf;

    /// <summary>Max events per epoll_wait() batch.</summary>
    internal readonly int MaxEvents;

    // Guards against double close / double free when Dispose is called more than once.
    private bool _disposed;

    /// <summary>
    /// Creates the epoll instance, opens and registers the listen socket, and allocates the
    /// event buffer. If any step fails, everything acquired so far is released before the
    /// exception propagates.
    /// </summary>
    /// <param name="idx">Worker index.</param>
    /// <param name="maxEvents">Capacity of the epoll_wait result buffer, in events.</param>
    /// <param name="port">TCP port to listen on (bound to 0.0.0.0).</param>
    /// <param name="backlog">Backlog passed to listen().</param>
    internal Worker(int idx, int maxEvents, int port, int backlog)
    {
        Index = idx;
        MaxEvents = maxEvents;
        ListenFd = -1; // "not opened" marker so Dispose skips it on the failure path

        Ep = epoll_create1(EPOLL_CLOEXEC);
        if (Ep < 0)
            throw new Exception("epoll_create1 failed");

        try
        {
            ListenFd = OpenReusePortListener(port, backlog);

            // Register our own listen socket (level-triggered: drain accepts each wake).
            byte* ev = stackalloc byte[EvSize];
            WriteEpollEvent(ev, EPOLLIN | EPOLLERR | EPOLLHUP, ListenFd);
            if (epoll_ctl(Ep, EPOLL_CTL_ADD, ListenFd, (IntPtr)ev) != 0)
                throw new Exception("epoll_ctl ADD listen failed");

            EventsBuf = Marshal.AllocHGlobal(EvSize * MaxEvents);
        }
        catch
        {
            // Previously a failure here leaked the epoll fd (and possibly the listen fd).
            Dispose();
            throw;
        }
    }

    /// <summary>
    /// Opens one non-blocking SO_REUSEPORT listener bound to 0.0.0.0:<paramref name="port"/>.
    /// </summary>
    /// <returns>The listening socket's file descriptor.</returns>
    /// <exception cref="Exception">socket, fcntl, bind or listen failed (message carries errno).</exception>
    private static int OpenReusePortListener(int port, int backlog)
    {
        int fd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
        if (fd < 0) throw new Exception($"socket failed errno={Marshal.GetLastPInvokeError()}");

        try
        {
            int one = 1;
            setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, ref one, sizeof(int));
            setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, ref one, sizeof(int));

            // The accept loop drains until EAGAIN, so a listener left in blocking mode would
            // stall its worker inside accept4. Fail loudly instead of continuing silently.
            int fl = fcntl(fd, F_GETFL, 0);
            if (fl < 0 || fcntl(fd, F_SETFL, fl | O_NONBLOCK) < 0)
                throw new Exception($"fcntl O_NONBLOCK failed errno={Marshal.GetLastPInvokeError()}");

            var addr = new sockaddr_in
            {
                sin_family = (ushort)AF_INET,
                sin_port = Htons((ushort)port),
                sin_addr = new in_addr { s_addr = 0 }, // 0.0.0.0
                sin_zero = new byte[8]
            };
            if (bind(fd, ref addr, (uint)Marshal.SizeOf<sockaddr_in>()) != 0)
                throw new Exception($"bind failed errno={Marshal.GetLastPInvokeError()}");
            if (listen(fd, backlog) != 0)
                throw new Exception($"listen failed errno={Marshal.GetLastPInvokeError()}");

            return fd;
        }
        catch
        {
            // errno was already captured in the exception message above, so closing here
            // cannot clobber it; without this the socket leaked on every failure.
            try { close(fd); } catch { /* best effort */ }
            throw;
        }
    }

    /// <summary>
    /// Closes the epoll fd and listen socket and frees the event buffer. Safe to call more
    /// than once; only the first call releases anything.
    /// </summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;

        try { if (Ep >= 0) close(Ep); } catch { /* ignore */ }
        try { if (ListenFd >= 0) close(ListenFd); } catch { /* ignore */ }
        if (EventsBuf != IntPtr.Zero) Marshal.FreeHGlobal(EventsBuf);
    }
}
/// <summary>
/// Managed (string-based) view of a parsed HTTP/1.x request head: method, route, query
/// parameters and headers. Both dictionaries compare keys with
/// <see cref="StringComparer.OrdinalIgnoreCase"/> and start with a capacity of 8.
/// </summary>
public class H1HeaderData
{
    /// <summary>Request path without the query string; set by the parser.</summary>
    public string Route { get; internal set; } = null!;

    /// <summary>HTTP method token; set by the parser.</summary>
    public string HttpMethod { get; internal set; } = null!;

    /// <summary>Query-string key/value pairs.</summary>
    public PooledDictionary<string, string> QueryParameters { get; } =
        new PooledDictionary<string, string>(capacity: 8, comparer: StringComparer.OrdinalIgnoreCase);

    /// <summary>Header name/value pairs.</summary>
    public PooledDictionary<string, string> Headers { get; } =
        new PooledDictionary<string, string>(capacity: 8, comparer: StringComparer.OrdinalIgnoreCase);

    /// <summary>Creates an instance with empty header and query-parameter dictionaries.</summary>
    public H1HeaderData()
    {
    }

    /// <summary>Empties both dictionaries. Route and HttpMethod are left untouched.</summary>
    public void Clear()
    {
        Headers?.Clear();
        QueryParameters?.Clear();
    }
}
/// <summary>
/// Searches the managed buffer window [head..tail) for the first CRLFCRLF sequence.
/// </summary>
/// <param name="buf">Buffer to search.</param>
/// <param name="head">Start offset (inclusive).</param>
/// <param name="tail">End offset (exclusive).</param>
/// <returns>Index (relative to <paramref name="buf"/>) of the first '\r', or -1 if absent.</returns>
/// <remarks>The caller must ensure 0 ≤ head ≤ tail ≤ buf.Length.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int FindCrlfCrlf(byte[] buf, int head, int tail)
{
    int relative = buf.AsSpan(head, tail - head).IndexOf(CrlfCrlf);
    if (relative < 0)
    {
        return -1;
    }

    // Translate the window-relative hit back into buffer coordinates.
    return head + relative;
}

/// <summary>
/// Searches the unmanaged window [buf+head, buf+tail) for the first CRLFCRLF sequence.
/// </summary>
/// <param name="buf">Base pointer of the buffer.</param>
/// <param name="head">Start offset (inclusive).</param>
/// <param name="tail">End offset (exclusive).</param>
/// <returns>Index (in the same coordinates as head/tail) of the first '\r', or -1 if absent.</returns>
/// <remarks>The caller must guarantee that the whole window is valid, readable memory.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int FindCrlfCrlf(byte* buf, int head, int tail)
{
    ReadOnlySpan<byte> window = new ReadOnlySpan<byte>(buf + head, tail - head);
    int relative = window.IndexOf(CrlfCrlf);

    return relative < 0 ? -1 : head + relative;
}
/// <summary>
/// Searches the unmanaged window [buf+head, buf+tail) for CRLFCRLF and also hands back a span
/// over that window, so the caller can parse the request line without rebuilding the span.
/// </summary>
/// <param name="buf">Base pointer of the buffer.</param>
/// <param name="head">Start offset (inclusive).</param>
/// <param name="tail">End offset (exclusive).</param>
/// <param name="idx">
/// Receives the index (in head/tail coordinates) of the '\r' that starts the sentinel,
/// or -1 when the sentinel is not present.
/// </param>
/// <returns>A span over [head..tail).</returns>
/// <remarks>The caller must guarantee that the whole window is valid, readable memory.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ReadOnlySpan<byte> FindCrlfCrlf(byte* buf, int head, int tail, ref int idx)
{
    ReadOnlySpan<byte> window = new ReadOnlySpan<byte>(buf + head, tail - head);

    int relative = window.IndexOf(CrlfCrlf);
    idx = relative < 0 ? -1 : relative + head;

    return window;
}
/// <summary>
/// Extracts the request target from the request line (e.g. "GET /path?x=1 HTTP/1.1") and
/// returns its FNV-1a 32-bit hash.
/// </summary>
/// <param name="headerSpan">Bytes beginning at the start of the request line.</param>
/// <returns>Hash of the raw request-target bytes (path plus query, no decoding).</returns>
/// <exception cref="InvalidOperationException">
/// The request line is malformed: no terminating CRLF, or fewer than two spaces.
/// </exception>
/// <remarks>
/// Steps: isolate the request line at the first CRLF; locate the first and second space
/// (METHOD SP REQUEST-TARGET SP HTTP-VERSION); hash the slice between them.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static uint ExtractRoute(ReadOnlySpan<byte> headerSpan)
{
    int lineEnd = headerSpan.IndexOf(Crlf);
    if (lineEnd < 0)
    {
        // Without this guard the range slice below failed with ArgumentOutOfRangeException,
        // unlike every other malformed-request-line case in this method.
        throw new InvalidOperationException("Invalid request line");
    }

    ReadOnlySpan<byte> requestLine = headerSpan[..lineEnd];

    int firstSpace = requestLine.IndexOf(Space);
    if (firstSpace == -1)
        throw new InvalidOperationException("Invalid request line");

    int secondSpaceRelative = requestLine[(firstSpace + 1)..].IndexOf(Space);
    if (secondSpaceRelative == -1)
        throw new InvalidOperationException("Invalid request line");

    int secondSpace = firstSpace + secondSpaceRelative + 1;

    // REQUEST-TARGET slice: may include path + query (e.g., "/foo?bar=baz")
    ReadOnlySpan<byte> url = requestLine[(firstSpace + 1)..secondSpace];

    return Fnv1a32(url);
}
/// <summary>
/// Parses an HTTP/1.x request head (request line followed by header lines) into a managed
/// <see cref="H1HeaderData"/>: method, route, query parameters and headers.
/// </summary>
/// <param name="headerSpan">
/// Bytes beginning at the start of the request line. Header parsing stops at the first empty
/// line, or at the end of the span if no empty line is present.
/// </param>
/// <returns>A new <see cref="H1HeaderData"/> populated from <paramref name="headerSpan"/>.</returns>
/// <exception cref="InvalidOperationException">
/// The request line is malformed: no terminating CRLF, or fewer than two spaces.
/// </exception>
/// <remarks>
/// Method, route, query keys, header names and header values are interned through the shared
/// caches in <see cref="CachedH1Data"/>; query values are decoded per request.
/// NOTE(review): route, query-key and header-value caches are fed from untrusted input and, as
/// far as this file shows, never evict — confirm that unbounded growth is acceptable.
/// </remarks>
internal static H1HeaderData ExtractH1HeaderData(ReadOnlySpan<byte> headerSpan)
{
    var headerData = new H1HeaderData();

    // ---- Request line: METHOD SP REQUEST-TARGET SP HTTP-VERSION CRLF ----
    int lineEnd = headerSpan.IndexOf(Crlf);
    if (lineEnd < 0)
        throw new InvalidOperationException("Invalid request line");

    ReadOnlySpan<byte> requestLine = headerSpan[..lineEnd];

    int firstSpace = requestLine.IndexOf(Space);
    if (firstSpace == -1)
        throw new InvalidOperationException("Invalid request line");

    // TryGetOrAdd always fills its out value (cached or freshly decoded); the bool only reports
    // whether the cache insert succeeded. Assign unconditionally so a lost insert race cannot
    // leave HttpMethod null.
    CachedH1Data.CachedHttpMethods.TryGetOrAdd(requestLine[..firstSpace], out string httpMethod);
    headerData.HttpMethod = httpMethod;

    int secondSpaceRelative = requestLine[(firstSpace + 1)..].IndexOf(Space);
    if (secondSpaceRelative == -1)
        throw new InvalidOperationException("Invalid request line");

    int secondSpace = firstSpace + secondSpaceRelative + 1;

    // REQUEST-TARGET slice: may include path + query (e.g., "/foo?bar=baz")
    ReadOnlySpan<byte> target = requestLine[(firstSpace + 1)..secondSpace];

    int queryStart = target.IndexOf(Question);
    ReadOnlySpan<byte> path = queryStart == -1 ? target : target[..queryStart];

    // Routes belong in the route cache (they were previously stored in the HTTP-method cache).
    CachedH1Data.CachedRoutes.TryGetOrAdd(path, out string route);
    headerData.Route = route;

    // ---- Query string: key=value pairs separated by '&' ----
    if (queryStart != -1)
    {
        ReadOnlySpan<byte> query = target[(queryStart + 1)..];
        int current = 0;

        while (current < query.Length)
        {
            int separator = query[current..].IndexOf(QuerySeparator); // (byte)'&'
            ReadOnlySpan<byte> pair;

            if (separator == -1)
            {
                pair = query[current..];
                current = query.Length;
            }
            else
            {
                pair = query.Slice(current, separator);
                current += separator + 1;
            }

            int equalsIndex = pair.IndexOf(Equal);
            if (equalsIndex == -1)
            {
                // Pair without '=' (e.g. "?flag&x=1"): skip it, but keep parsing the rest.
                continue;
            }

            ReadOnlySpan<byte> keyBytes = pair[..equalsIndex];

            // GetOrAdd returns null when its insert loses a race; decode directly in that case
            // so a null key is never handed to the dictionary.
            string key = CachedH1Data.CachedQueryKeys.GetOrAdd(keyBytes)
                         ?? Encoding.UTF8.GetString(keyBytes);

            headerData.QueryParameters!.TryAdd(key, Encoding.UTF8.GetString(pair[(equalsIndex + 1)..]));
        }
    }

    // ---- Header lines (parsed for every request, with or without a query string) ----
    int lineStart = lineEnd + 2;
    while (lineStart < headerSpan.Length)
    {
        int length = headerSpan[lineStart..].IndexOf(Crlf);
        if (length <= 0)
        {
            // 0  => empty line: end of the header block.
            // -1 => no further complete line in the span.
            break;
        }

        ReadOnlySpan<byte> header = headerSpan.Slice(lineStart, length);
        lineStart += length + 2;

        int colonIndex = header.IndexOf(Colon);
        if (colonIndex == -1)
        {
            // Malformed header line: ignore it.
            continue;
        }

        ReadOnlySpan<byte> nameBytes = header[..colonIndex];

        // Trim optional whitespace (space / tab) around the value instead of assuming exactly
        // one space after the colon, which threw on "Name:" and kept stray bytes on "Name:value".
        int valueStart = colonIndex + 1;
        while (valueStart < header.Length && (header[valueStart] == Space || header[valueStart] == (byte)'\t'))
        {
            valueStart++;
        }

        int valueEnd = header.Length;
        while (valueEnd > valueStart && (header[valueEnd - 1] == Space || header[valueEnd - 1] == (byte)'\t'))
        {
            valueEnd--;
        }

        ReadOnlySpan<byte> valueBytes = header[valueStart..valueEnd];

        string name = CachedH1Data.CachedHeaderKeys.GetOrAdd(nameBytes)
                      ?? Encoding.UTF8.GetString(nameBytes);
        string value = CachedH1Data.CachedHeaderValues.GetOrAdd(valueBytes)
                       ?? Encoding.UTF8.GetString(valueBytes);

        headerData.Headers!.TryAdd(name, value);
    }

    return headerData;
}
/// <summary>
/// Inserts a pre-cacheable string into the map, keyed by its UTF-8 bytes. Called by the
/// constructor for each pre-cached entry.
/// </summary>
/// <param name="item">String to pre-cache.</param>
private unsafe void Add(string item)
{
    byte[] bytes = Encoding.UTF8.GetBytes(item);

    // The key must outlive this call. Building it directly over the managed array stored a raw
    // pointer into unpinned memory that became unreachable on return, i.e. a dangling pointer.
    // Pin only long enough to copy the bytes into native memory, as AllocateSequence does for
    // keys added at runtime.
    fixed (byte* source = bytes)
    {
        PinnedByteSequence key = AllocateSequence(new PinnedByteSequence(source, bytes.Length));

        lock (_gate)
        {
            if (!_map.TryAdd(key, item))
            {
                // Duplicate pre-cache entry: release the copy that was not stored.
                NativeMemory.AlignedFree(key.Ptr);
            }
        }
    }
}
/// <summary>
/// Cache that maps UTF-8 byte sequences to their decoded strings so repeated tokens are
/// decoded once. Keys are compared by content.
/// </summary>
/// <remarks>
/// All dictionary access happens under one lock: <see cref="Dictionary{TKey,TValue}"/> does not
/// support a reader running concurrently with a writer. Keys are stored as private copies, so
/// a caller may reuse or overwrite its buffer after the call without corrupting the map.
/// </remarks>
internal class StringCacheMemoryVariant
{
    private readonly Dictionary<ReadOnlyMemory<byte>, string> _map;

    private readonly Lock _gate = new();

    /// <summary>Creates the cache and optionally pre-populates it.</summary>
    /// <param name="preCacheableStrings">Strings to insert up front, or null for none.</param>
    /// <param name="capacity">Initial dictionary capacity.</param>
    public StringCacheMemoryVariant(List<string>? preCacheableStrings, int capacity = 256)
    {
        _map = new Dictionary<ReadOnlyMemory<byte>, string>(capacity, ReadOnlyMemoryComparer.Instance);

        if (preCacheableStrings is null)
        {
            return;
        }

        foreach (var preCacheableString in preCacheableStrings)
        {
            Add(preCacheableString);
        }
    }

    /// <summary>
    /// Returns the cached string for <paramref name="bytes"/>, decoding and caching it on a miss.
    /// </summary>
    /// <param name="bytes">UTF-8 bytes to look up; not retained by the cache.</param>
    /// <param name="value">Receives the cached or newly decoded string (always set).</param>
    /// <returns><c>true</c> when the value was found or successfully added.</returns>
    public bool TryGetOrAdd(ReadOnlyMemory<byte> bytes, out string value)
    {
        lock (_gate)
        {
            if (_map.TryGetValue(bytes, out string? cached))
            {
                value = cached;
                return true;
            }

            // Miss: decode once and store under a private copy of the key bytes, so later
            // writes to the caller's buffer cannot change a key that is already hashed.
            value = Encoding.UTF8.GetString(bytes.Span);
            return _map.TryAdd(bytes.ToArray(), value);
        }
    }

    // Inserts a pre-cacheable string; the freshly encoded array is owned by the map.
    private void Add(string item)
    {
        byte[] bytes = Encoding.UTF8.GetBytes(item);

        lock (_gate)
        {
            _map.TryAdd(bytes, item);
        }
    }

    // Content-based equality and hashing for byte-sequence keys.
    private sealed class ReadOnlyMemoryComparer : IEqualityComparer<ReadOnlyMemory<byte>>
    {
        public static readonly ReadOnlyMemoryComparer Instance = new();

        public bool Equals(ReadOnlyMemory<byte> x, ReadOnlyMemory<byte> y)
            => x.Span.SequenceEqual(y.Span);

        public int GetHashCode(ReadOnlyMemory<byte> mem)
        {
            var h = new HashCode();
            h.AddBytes(mem.Span);
            return h.ToHashCode();
        }
    }
}
/// <summary>
/// Payload with a single string property; registered with <c>JsonContext</c> for
/// source-generated System.Text.Json serialization.
/// </summary>
public struct JsonMessage
{
    /// <summary>The message text.</summary>
    public string Message { get; set; }
}
/// <summary>
/// Maintains a pre-formatted "Date: &lt;RFC 1123 date&gt;\r\n\r\n" header as UTF-8 bytes,
/// refreshed once per second by a timer.
/// </summary>
/// <remarks>
/// Two buffers are used: the timer formats the new date into the scratch buffer and then swaps
/// it with the published one, so readers never see a buffer while it is being formatted.
/// </remarks>
public static class DateHelper
{
    private const int PrefixLength = 6; // "Date: ".Length
    private const int DateTimeRLength = 29; // Wed, 14 Mar 2018 14:20:00 GMT
    private const int SuffixLength = 2; // crlf
    private const int SuffixIndex = DateTimeRLength + PrefixLength;
    private const int HeaderLength = PrefixLength + DateTimeRLength + 2 * SuffixLength;

    // Dedicated lock object. The previous code locked on _sHeaderBytesScratch, a field that is
    // reassigned inside the critical section, so two callers could lock different objects and
    // run the format-and-swap concurrently.
    private static readonly object SGate = new();

    private static byte[] _sHeaderBytesMaster = CreateTemplate();
    private static byte[] _sHeaderBytesScratch = CreateTemplate();

    private static readonly Timer STimer;

    static DateHelper()
    {
        // Publish a valid header before the first tick, then refresh every second.
        SetDateValues(DateTimeOffset.UtcNow);
        STimer = new Timer(static _ => SetDateValues(DateTimeOffset.UtcNow), null, 1000, 1000);
    }

    /// <summary>The current header bytes: "Date: ", the RFC 1123 date, then CRLF CRLF.</summary>
    public static ReadOnlySpan<byte> HeaderBytes => _sHeaderBytesMaster;

    // Builds a buffer holding the constant parts: the "Date: " prefix and the trailing CRLFCRLF.
    private static byte[] CreateTemplate()
    {
        var buffer = new byte[HeaderLength];
        "Date: "u8.CopyTo(buffer);
        "\r\n\r\n"u8.CopyTo(buffer.AsSpan(SuffixIndex));
        return buffer;
    }

    // Formats the given instant into the scratch buffer and publishes it by swapping buffers.
    private static void SetDateValues(DateTimeOffset value)
    {
        lock (SGate)
        {
            if (!Utf8Formatter.TryFormat(value, _sHeaderBytesScratch.AsSpan(PrefixLength), out _, 'R'))
                throw new Exception("date time format failed");

            (_sHeaderBytesScratch, _sHeaderBytesMaster) = (_sHeaderBytesMaster, _sHeaderBytesScratch);
        }
    }
}
/// <summary>
/// Small non-cryptographic hash helpers for short, hot-path inputs such as routes,
/// header names or method tokens.
/// </summary>
internal static class HashUtils
{
    /// <summary>
    /// Computes the 32-bit FNV-1a hash of <paramref name="data"/>:
    /// start at 2166136261, then for every byte XOR it in and multiply by 16777619.
    /// </summary>
    /// <param name="data">Bytes to hash (typically a small UTF-8 slice).</param>
    /// <returns>The 32-bit hash; 2166136261 for empty input.</returns>
    /// <remarks>Not suitable for security-sensitive use.</remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static uint Fnv1a32(ReadOnlySpan<byte> data)
    {
        const uint OffsetBasis = 2166136261u;
        const uint Prime = 16777619u;

        uint hash = OffsetBasis;
        foreach (byte b in data)
        {
            hash = (hash ^ b) * Prime;
        }

        return hash;
    }

    /// <summary>
    /// Computes an 8-bit FNV-1a-style hash of <paramref name="data"/>:
    /// start at 0xA3, then for every byte XOR it in and multiply by 0x9B, keeping the low 8 bits.
    /// </summary>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>A value in 0–255; 0xA3 for empty input. Collisions are expected.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static byte Fnv1a8(ReadOnlySpan<byte> data)
    {
        const byte OffsetBasis = 0xA3; // 163 decimal
        const byte Prime = 0x9B;       // 155 decimal

        byte hash = OffsetBasis;
        foreach (byte b in data)
        {
            // The product is computed as int; the unchecked cast keeps only the low byte.
            hash = unchecked((byte)((hash ^ b) * Prime));
        }

        return hash;
    }
}
obj) + { + return obj is PinnedByteSequence other && Equals(other); + } + + public override int GetHashCode() + { + return HashCode.Combine(unchecked((int)(long)_ptr), Length); + } +} \ No newline at end of file diff --git a/Shrike/Utilities/PooledDictionary.cs b/Shrike/Utilities/PooledDictionary.cs new file mode 100644 index 0000000..fc32b08 --- /dev/null +++ b/Shrike/Utilities/PooledDictionary.cs @@ -0,0 +1,285 @@ +using System.Collections; + +namespace Shrike; + +/// +/// Represents a high-performance, pooled dictionary that minimizes allocations by renting internal arrays from . +/// This structure is optimized for small, short-lived dictionaries such as HTTP headers or per-request state. +/// +/// The type of keys in the dictionary. Must implement . +/// The type of values in the dictionary. +public class PooledDictionary : IDictionary, IReadOnlyDictionary, IEnumerator> where TKey : IEquatable +{ + private static readonly ArrayPool> Pool = ArrayPool>.Shared; + + private short _enumerator = -1; + private ushort _index; + + private KeyValuePair[]? 
_entries; + + private readonly IEqualityComparer _comparer; + + #region Get-/Setters + + private KeyValuePair[] Entries => _entries ??= Pool.Rent(Capacity); + + private bool HasEntries => _entries is not null; + + public virtual TValue this[TKey key] + { + get + { + if (HasEntries) + { + for (var i = 0; i < _index; i++) + { + if (_comparer.Equals(Entries[i].Key, key)) + { + return Entries[i].Value; + } + } + } + + throw new KeyNotFoundException(); + } + set + { + if (HasEntries) + { + for (var i = 0; i < _index; i++) + { + if (_comparer.Equals(Entries[i].Key, key)) + { + Entries[i] = new KeyValuePair(key, value); + return; + } + } + } + + Add(key, value); + } + } + + public ICollection Keys + { + get + { + var result = new List(_index); + + if (HasEntries) + { + for (var i = 0; i < _index; i++) + { + result.Add(Entries[i].Key); + } + } + + return result; + } + } + + public ICollection Values + { + get + { + var result = new List(_index); + + if (HasEntries) + { + for (var i = 0; i < _index; i++) + { + result.Add(Entries[i].Value); + } + } + + return result; + } + } + + public int Count => _index; + + public bool IsReadOnly => false; + + IEnumerable IReadOnlyDictionary.Keys => Keys; + + IEnumerable IReadOnlyDictionary.Values => Values; + + public KeyValuePair Current => Entries[_enumerator]; + + object IEnumerator.Current => Entries[_enumerator]; + + public int Capacity { get; private set; } + + #endregion + + #region Initialization + + public PooledDictionary() : this(4, EqualityComparer.Default) + { + + } + + public PooledDictionary(int capacity, IEqualityComparer comparer) + { + Capacity = capacity; + + _comparer = comparer; + } + + #endregion + + #region Functionality + + public virtual void Add(TKey key, TValue value) + { + CheckResize(); + Entries[_index++] = new KeyValuePair(key, value); + } + + public virtual void Add(KeyValuePair item) + { + CheckResize(); + Entries[_index++] = item; + } + + public void Clear() + { + _index = 0; + } + + public bool 
Contains(KeyValuePair item) + { + if (HasEntries) + { + for (var i = 0; i < _index; i++) + { + if (_comparer.Equals(Entries[i].Key, item.Key)) + { + return true; + } + } + } + + return false; + } + + public bool ContainsKey(TKey key) + { + if (HasEntries) + { + for (var i = 0; i < _index; i++) + { + if (_comparer.Equals(Entries[i].Key, key)) + { + return true; + } + } + } + + return false; + } + + public void CopyTo(KeyValuePair[] array, int arrayIndex) + { + throw new NotSupportedException(); + } + + public IEnumerator> GetEnumerator() + { + _enumerator = -1; + return this; + } + + public bool Remove(TKey key) => throw new NotSupportedException(); + + public bool Remove(KeyValuePair item) => throw new NotSupportedException(); + + public bool TryGetValue(TKey key, out TValue value) + { + if (ContainsKey(key)) + { + value = this[key]; + return true; + } + +#pragma warning disable CS8653, CS8601 + value = default; +#pragma warning restore + + return false; + } + + IEnumerator IEnumerable.GetEnumerator() => this; + + public bool MoveNext() + { + _enumerator++; + return _enumerator < _index; + } + + public void Reset() + { + _enumerator = -1; + } + + private void CheckResize() + { + if (_index >= Entries.Length) + { + var oldEntries = Entries; + + try + { + if (oldEntries.Length > Capacity) + { + Capacity = oldEntries.Length * 2; + } + else + { + Capacity *= 2; + } + + _entries = Pool.Rent(Capacity); + + for (var i = 0; i < _index; i++) + { + Entries[i] = oldEntries[i]; + } + } + finally + { + Pool.Return(oldEntries); + } + } + } + + #endregion + + #region IDisposable Support + + private bool _disposed; + + private void Dispose(bool disposing) + { + if (_disposed) + return; + + if (disposing) + { + if (HasEntries) + { + Pool.Return(Entries); + } + } + + _disposed = true; + } + + public void Dispose() + { + Dispose(true); + } + + #endregion + +} \ No newline at end of file diff --git a/Shrike/Utilities/UnmanagedMemoryManager.cs 
b/Shrike/Utilities/UnmanagedMemoryManager.cs
new file mode 100644
index 0000000..0bb447d
--- /dev/null
+++ b/Shrike/Utilities/UnmanagedMemoryManager.cs
@@ -0,0 +1,83 @@
// ReSharper disable always CheckNamespace
// ReSharper disable always SuggestVarOrType_BuiltInTypes
// (var is avoided intentionally in this project so that concrete types are visible at call sites.)
// ReSharper disable always StackAllocInsideLoop
// ReSharper disable always ClassCannotBeInstantiated
#pragma warning disable CA2014

namespace Shrike;

/// <summary>
/// Exposes a caller-supplied block of unmanaged memory as <see cref="Memory{T}"/> and
/// <see cref="Span{T}"/> without taking ownership of it.
/// </summary>
/// <remarks>
/// This class neither allocates nor frees the memory. The caller must keep the pointer
/// valid for as long as this instance, or any <see cref="Memory{T}"/> obtained from it,
/// is in use.
/// </remarks>
public sealed unsafe class UnmanagedMemoryManager : MemoryManager<byte>
{
    private readonly byte* _ptr;
    private readonly int _length;

    /// <summary>
    /// Initializes a new instance over an existing unmanaged memory block.
    /// </summary>
    /// <param name="ptr">A pointer to the start of the unmanaged memory block.</param>
    /// <param name="length">The length of the memory block, in bytes.</param>
    /// <exception cref="ArgumentNullException"><paramref name="ptr"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
    public UnmanagedMemoryManager(byte* ptr, int length)
    {
        if (ptr == null)
        {
            throw new ArgumentNullException(nameof(ptr));
        }

        if (length < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(length));
        }

        _ptr = ptr;
        _length = length;
    }

    /// <summary>Returns a span covering the whole block.</summary>
    public override Span<byte> GetSpan() => new(_ptr, _length);

    /// <summary>
    /// Returns a handle pointing at <c>_ptr + elementIndex</c>. The handle carries no GC
    /// handle and no pinnable owner, so disposing it does nothing.
    /// </summary>
    /// <param name="elementIndex">Offset, in bytes, from the start of the block.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="elementIndex"/> is negative or greater than the block length.
    /// </exception>
    public override MemoryHandle Pin(int elementIndex = 0)
    {
        // One past the end is allowed: pinning an empty slice positioned at the end of the
        // block asks for exactly that offset.
        if ((uint)elementIndex > (uint)_length)
        {
            throw new ArgumentOutOfRangeException(nameof(elementIndex));
        }

        return new MemoryHandle(_ptr + elementIndex);
    }

    /// <summary>No-op: nothing was pinned by <see cref="Pin"/>.</summary>
    public override void Unpin() { }

    /// <summary>No-op: this class does not own the memory it wraps.</summary>
    /// <param name="disposing">Ignored.</param>
    protected override void Dispose(bool disposing) { }
}
diff --git a/Shrike/Writers/FixedBufferWriter.cs b/Shrike/Writers/FixedBufferWriter.cs
new file mode 100644
index 0000000..efdf9e2
--- /dev/null
+++ b/Shrike/Writers/FixedBufferWriter.cs
@@ -0,0 +1,209 @@
// ReSharper disable always CheckNamespace
// ReSharper disable always SuggestVarOrType_BuiltInTypes
// (var is avoided intentionally in this project so that concrete types are visible at call sites.)
// ReSharper disable always StackAllocInsideLoop
// ReSharper disable always ClassCannotBeInstantiated

using System.Text;

#pragma warning disable CA2014

namespace Shrike;

///
/// A high-performance, *unmanaged* buffer writer designed for scenarios where
/// zero allocations and deterministic memory layout are critical.
///
/// This struct provides a writable view over a fixed memory region (provided as
/// a raw pointer). It does not own or allocate memory itself
/// — unless the caller provides it one that was manually allocated, in which
/// case can free it if desired.
///
/// The typical use case is for writing binary or HTTP data directly into a
/// pre-allocated unmanaged buffer (e.g., a native slab per connection) without
/// heap allocations or GC involvement.
///
[SkipLocalsInit]
public unsafe class FixedBufferWriter : IUnmanagedBufferWriter<byte>, IBufferWriter<byte>, IDisposable
{
    // =========================================================================
    // Fields
    // =========================================================================

    /// <summary>
    /// The total capacity (in bytes) of the memory region represented by this writer.
    /// </summary>
    private readonly int _capacity;

    private readonly UnmanagedMemoryManager _manager;

    // Set by Dispose so that a second call cannot free the same pointer again.
    private bool _disposed;

    /// <summary>
    /// The current read position (if the buffer is also reused for reads).
    /// Not used by the writer itself, but exposed for external control.
    /// </summary>
    public int Head;

    /// <summary>
    /// The current write position. Bytes have been written in [0 .. Tail).
    /// </summary>
    public int Tail;

    /// <summary>
    /// Pointer to the beginning of the unmanaged buffer.
    /// </summary>
    public byte* Ptr { get; }

    // =========================================================================
    // Constructor
    // =========================================================================

    /// <summary>
    /// Creates a writer over an unmanaged memory region. <paramref name="ptr"/> must point
    /// to at least <paramref name="capacity"/> bytes that stay valid for the lifetime of
    /// this instance.
    /// </summary>
    /// <param name="ptr">Pointer to the start of the unmanaged buffer.</param>
    /// <param name="capacity">Maximum number of bytes writable to the buffer.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public FixedBufferWriter(byte* ptr, int capacity)
    {
        Ptr = ptr;
        _capacity = capacity;
        Head = 0;
        Tail = 0;

        _manager = new UnmanagedMemoryManager(ptr, capacity);
    }

    // =========================================================================
    // Core Methods
    // =========================================================================

    /// <summary>
    /// Resets <see cref="Head"/> and <see cref="Tail"/> to zero. The underlying memory is
    /// not modified.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Reset()
    {
        Head = 0;
        Tail = 0;
    }

    /// <summary>
    /// Advances the write position by <paramref name="count"/> bytes after data has been
    /// written directly into the memory region.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    /// <exception cref="InvalidOperationException">The new position would pass the end of the buffer.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Advance(int count)
    {
        if (count < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(count));
        }

        if (count > _capacity - Tail)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        Tail += count;
    }

    /// <summary>
    /// Returns the unwritten remainder of the buffer as <see cref="Memory{T}"/>.
    /// </summary>
    /// <param name="sizeHint">The minimum required size for the writable region.</param>
    /// <exception cref="InvalidOperationException">Fewer than <paramref name="sizeHint"/> bytes remain.</exception>
    public Memory<byte> GetMemory(int sizeHint = 0)
    {
        int remaining = _capacity - Tail;

        if (sizeHint > remaining)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        return _manager.Memory.Slice(Tail, remaining);
    }

    /// <summary>
    /// Gets the raw pointer to the start of the buffer (not to the write position).
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public byte* GetPointer() => Ptr;

    /// <summary>
    /// Returns the unwritten remainder of the buffer, starting at <see cref="Tail"/>.
    /// </summary>
    /// <param name="sizeHint">The minimum required size for the writable region.</param>
    /// <exception cref="InvalidOperationException">Fewer than <paramref name="sizeHint"/> bytes remain.</exception>
    public Span<byte> GetSpan(int sizeHint = 0)
    {
        int remaining = _capacity - Tail;

        // Compared against the remainder so a huge hint cannot overflow Tail + sizeHint.
        if (sizeHint > remaining)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        return new Span<byte>(Ptr + Tail, remaining);
    }

    // =========================================================================
    // Write Helpers
    // =========================================================================

    /// <summary>
    /// Copies <paramref name="source"/> into the buffer with <see cref="Buffer.MemoryCopy"/>
    /// and advances <see cref="Tail"/>. The caller must ensure the source does not overlap
    /// the target region.
    /// </summary>
    /// <param name="source">Data to copy into the buffer.</param>
    /// <exception cref="InvalidOperationException">The data does not fit.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void WriteUnmanaged(ReadOnlySpan<byte> source)
    {
        int len = source.Length;
        int remaining = _capacity - Tail;

        if (len > remaining)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        fixed (byte* src = source)
        {
            Buffer.MemoryCopy(src, Ptr + Tail, remaining, len);
        }

        Tail += len;
    }

    /// <summary>
    /// Encodes <paramref name="source"/> as UTF-8 directly into the buffer and advances
    /// <see cref="Tail"/> by the number of bytes produced. The encoder itself throws when
    /// the remaining space is too small.
    /// </summary>
    public void WriteUnmanaged(string source)
    {
        Span<byte> span = new(Ptr + Tail, _capacity - Tail);
        int bytesWritten = Encoding.UTF8.GetBytes(source, span);
        Tail += bytesWritten;
    }

    /// <summary>
    /// Copies <paramref name="source"/> into the buffer with <c>Span.CopyTo</c> and
    /// advances <see cref="Tail"/>.
    /// </summary>
    /// <param name="source">The data to copy into the buffer.</param>
    /// <exception cref="InvalidOperationException">The data does not fit.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Write(ReadOnlySpan<byte> source)
    {
        int len = source.Length;
        int remaining = _capacity - Tail;

        if (len > remaining)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        source.CopyTo(new Span<byte>(Ptr + Tail, remaining));
        Tail += len;
    }

    // =========================================================================
    // Disposal
    // =========================================================================

    /// <summary>
    /// Frees <see cref="Ptr"/> with <see cref="NativeMemory.AlignedFree"/>. Only the first
    /// call frees; later calls do nothing.
    /// </summary>
    /// <remarks>
    /// Call this only when this instance owns the buffer. If <see cref="Ptr"/> points into
    /// a shared region (e.g. a connection pool or slab allocator), freeing it here
    /// invalidates that memory for every other user.
    /// </remarks>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        if (Ptr != null)
        {
            NativeMemory.AlignedFree(Ptr);
        }
    }
}
\ No newline at end of file
diff --git a/Shrike/Writers/ISpanWriter.cs b/Shrike/Writers/ISpanWriter.cs
new file mode 100644
index 0000000..6c70fb8
--- /dev/null
+++ b/Shrike/Writers/ISpanWriter.cs
@@ -0,0 +1,20 @@
// ReSharper disable always CheckNamespace
// ReSharper disable always SuggestVarOrType_BuiltInTypes
// (var is avoided intentionally in this project so that concrete types are visible at call sites.)
// ReSharper disable always StackAllocInsideLoop
// ReSharper disable always ClassCannotBeInstantiated
#pragma warning disable CA2014

namespace Shrike;

// NOTE(review): the type parameter was lost from this text; <T> is reconstructed from the
// doc wording ("data items") — confirm against the real file.
internal interface ISpanWriter<T>
{
    /// <summary>Notifies the writer that <paramref name="count"/> data items were written to the output span.</summary>
    /// <param name="count">The number of data items written.</param>
    void Advance(int count);

    /// <summary>Returns a span to write to that is at least the requested size.</summary>
    /// <param name="sizeHint">The minimum length of the returned span. If 0, a non-empty buffer is returned.</param>
    /// <returns>A span of at least <paramref name="sizeHint"/> items. If <paramref name="sizeHint"/> is 0, returns a non-empty buffer.</returns>
    Span<T> GetSpan(int sizeHint = 0);
}
\ No newline at end of file
diff --git a/Shrike/Writers/IUnmanagedBufferWriter.cs b/Shrike/Writers/IUnmanagedBufferWriter.cs
new file mode 100644
index 0000000..b8e2187
--- /dev/null
+++ b/Shrike/Writers/IUnmanagedBufferWriter.cs
@@ -0,0 +1,40 @@
// ReSharper disable always CheckNamespace
// ReSharper disable always SuggestVarOrType_BuiltInTypes
// (var is avoided intentionally in this project so that concrete types are visible at call sites.)
// ReSharper disable always StackAllocInsideLoop
// ReSharper disable always ClassCannotBeInstantiated
#pragma warning disable CA2014

namespace Shrike;

///
/// Minimal contract for writing to a caller-provided unmanaged, contiguous buffer.
/// Intended for high-performance I/O/serialization without GC pinning.
///
/// Unmanaged element type (e.g., byte).
internal unsafe interface IUnmanagedBufferWriter<T> where T : unmanaged
{
    /// <summary>
    /// Advance the logical write cursor by <paramref name="count"/> elements
    /// after data was written directly into the buffer.
    /// </summary>
    void Advance(int count);

    /// <summary>
    /// Base pointer to the buffer. Valid only while the writer is alive.
    /// Callers that write via this pointer must also call <see cref="Advance"/>.
    /// </summary>
    T* GetPointer();

    /// <summary>
    /// Copy <paramref name="source"/> into the buffer and advance the cursor.
    /// Throw if it would exceed capacity.
    /// </summary>
    void Write(ReadOnlySpan<T> source);

    /// <summary>
    /// Like <see cref="Write"/>, but allows an implementation
    /// to use an unsafe/fast path (e.g., MemoryCopy). Must still enforce bounds.
    /// </summary>
    void WriteUnmanaged(ReadOnlySpan<T> source);
}
diff --git a/Shrike/_usings.cs b/Shrike/_usings.cs
new file mode 100644
index 0000000..62ad342
--- /dev/null
+++ b/Shrike/_usings.cs
@@ -0,0 +1,15 @@
global using System;
global using System.Buffers;
global using System.Buffers.Binary;
global using System.Buffers.Text;
global using System.Collections.Concurrent;
global using System.Runtime.CompilerServices;
global using System.Runtime.InteropServices;
global using System.Text.Json.Serialization;

global using Microsoft.Extensions.ObjectPool;

global using static Shrike.ProcessorArchDependant;
global using static Shrike.Native;
global using static Shrike.HeaderParsing;
global using static Shrike.HashUtils;
\ No newline at end of file
diff --git a/Spring.Demo/Program.cs b/Spring.Demo/Program.cs
new file mode 100644
index 0000000..16ff3d1
--- /dev/null
+++ b/Spring.Demo/Program.cs
@@ -0,0 +1,23 @@
using Microsoft.Extensions.Logging;
using Spring;

var builder = WebApplication.CreateBuilder(args);

builder.Logging.SetMinimumLevel(LogLevel.Warning); // benchmark: silence per-request logs

builder.WebHost.UseKestrel(kestrel =>
{
    kestrel.ListenAnyIP(8080);
});

// SPRING=0 → Kestrel's default Socket transport (baseline). Otherwise the io_uring Spring transport.
if (Environment.GetEnvironmentVariable("SPRING") != "0")
{
    builder.WebHost.UseSpring(opts => opts.ReactorCount = Math.Max(1, 12));
}

var app = builder.Build();

app.MapGet("/", () => "Hello from Spring + Kestrel\n");

app.Run();
diff --git a/Spring.Demo/Spring.Demo.csproj b/Spring.Demo/Spring.Demo.csproj
new file mode 100644
index 0000000..89ff302
--- /dev/null
+++ b/Spring.Demo/Spring.Demo.csproj
@@ -0,0 +1,15 @@
+ + + + net10.0 + enable + enable + true + true + + + + + + +
diff --git a/Spring/Connection/Connection.Incremental.cs b/Spring/Connection/Connection.Incremental.cs
new file mode 100644
index 0000000..34ba733
--- /dev/null
+++ b/Spring/Connection/Connection.Incremental.cs
@@ -0,0 +1,61 @@
using System.Runtime.InteropServices;
using Spring.Utils;
// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace Spring;

/// <summary>
/// Incremental-mode (IOU_PBUF_RING_INC) per-connection buffer-ring state.
/// Each connection owns its own ring + slab; one buffer accumulates this
/// connection's byte stream across many recvs. The reactor (Reactor.Incremental)
/// drives setup/teardown and the refcounted recycle; this partial just holds the
/// state and routes a handler return to the right reactor entry point.
///
/// All of these stay allocated across pool reuse and are freed in Dispose().
/// </summary>
public sealed unsafe partial class Connection
{
    internal byte* BufRing;          // kernel-shared ring control area
    internal byte* BufSlab;          // this connection's recv slab
    internal ushort Bgid;
    internal uint BufRingMask;
    internal int BufRingEntries;
    internal bool IncrementalMode;

    internal int[]? CumOffset;       // per-bid: byte offset where the next slice begins
    internal int[]? RefCount;        // per-bid: outstanding handler refs
    internal bool[]? KernelDone;     // per-bid: kernel finished appending (no F_BUF_MORE)

    internal int Generation => Volatile.Read(ref _generation);

    /// <summary>
    /// Called by the handler to hand a consumed recv buffer back. Routes by mode:
    /// incremental returns carry (fd, gen, bid) for refcounted recycle; the shared
    /// path returns the bare bid to the reactor's single buf_ring.
    /// </summary>
    public void ReturnBuffer(in SpscRecvRing.Item item)
    {
        if (IncrementalMode)
        {
            _reactor.EnqueueReturnQIncremental(ClientFd, item.Gen, item.Bid);
        }
        else
        {
            _reactor.EnqueueReturnQ(item.Bid);
        }
    }

    // Frees the ring and slab if they were allocated, and nulls the pointers so a second
    // call is harmless.
    private void DisposeIncremental()
    {
        if (BufRing != null)
        {
            NativeMemory.AlignedFree(BufRing);
            BufRing = null;
        }
        if (BufSlab != null)
        {
            NativeMemory.AlignedFree(BufSlab);
            BufSlab = null;
        }
    }
}
diff --git a/Spring/Connection/Connection.InputPipe.cs b/Spring/Connection/Connection.InputPipe.cs
new file mode 100644
index 0000000..698d9d5
--- /dev/null
+++ b/Spring/Connection/Connection.InputPipe.cs
@@ -0,0 +1,45 @@
using System.IO.Pipelines;

namespace Spring;

/// <summary>
/// Kestrel-mode input path. The reactor copies recv bytes into a real BCL
/// <see cref="Pipe"/> and Kestrel reads InputPipe.Reader — bypassing the
/// hand-rolled read IVTS, which can't take Kestrel's concurrent off-reactor
/// access. Output still uses the write slab + FlushAsync (the single-issuer
/// EnqueueFlush handoff to the reactor). Null on the raw path.
/// </summary>
public sealed unsafe partial class Connection
{
    internal Pipe? InputPipe;

    internal void InitInputPipe()
        => InputPipe = new Pipe(new PipeOptions(
            pauseWriterThreshold: 0,
            resumeWriterThreshold: 0,
            useSynchronizationContext: false));

    /// <summary>Reactor-thread: copy recv bytes into the pipe and publish.</summary>
    internal void FeedInput(byte* ptr, int len)
    {
        // Nothing to publish. A negative length would make GetSpan throw on the reactor
        // thread, and a zero length would only wake the reader for no data.
        if (len <= 0)
        {
            return;
        }

        PipeWriter writer = InputPipe!.Writer;

        Span<byte> dst = writer.GetSpan(len);
        new ReadOnlySpan<byte>(ptr, len).CopyTo(dst);
        writer.Advance(len);
        _ = writer.FlushAsync(); // no backpressure → completes synchronously
    }

    /// <summary>Reactor-thread: signal EOF to Kestrel's reader.</summary>
    internal void CompleteInput(Exception? error = null)
        => InputPipe?.Writer.Complete(error);

    /// <summary>
    /// Resume read/flush continuations on the thread pool. Kestrel drives the
    /// connection off-reactor, so the reactor's CompleteFlush must NOT run
    /// Kestrel inline. Call before the first FlushAsync.
    /// </summary>
    public void UseAsyncContinuations()
    {
        _readSignal.RunContinuationsAsynchronously = true;
        _flushSignal.RunContinuationsAsynchronously = true;
    }
}
diff --git a/Spring/Connection/Connection.Read.cs b/Spring/Connection/Connection.Read.cs
new file mode 100644
index 0000000..d09c190
--- /dev/null
+++ b/Spring/Connection/Connection.Read.cs
@@ -0,0 +1,163 @@
using System.Threading.Tasks.Sources;
using Spring.Utils;

// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace Spring;

///
/// Per-connection state. The handler may run on any thread (e.g. resumed by
/// a thread-pool timer); reactor-only side effects are funnelled through the
/// MPSC queues on `Reactor`. Coordination uses Interlocked.Exchange on the
/// arm flags and a sticky `_pending` to close the lost-wakeup race.
///
/// Lifetime is pool-managed: the reactor pops a Connection on accept (or new
/// one if pool is empty), and pushes it back on teardown after `Clear()`. The
/// `_generation` field is bumped on each `Clear` so stale `ValueTask` tokens
/// from a previous connection life are detectable and return `Closed()`
/// instead of leaking the new tenant's state.
///
public sealed unsafe partial class Connection : IValueTaskSource<RecvSnapshot>
{
    internal Connection SetFd(int fd)
    {
        ClientFd = fd;
        return this;
    }

    private ManualResetValueTaskSourceCore<RecvSnapshot> _readSignal;
    private int _armed;   // 1 while a ReadAsync awaiter is waiting on _readSignal
    private int _pending; // sticky "something happened while nobody was armed" flag
    private int _closed;

    private readonly SpscRecvRing _recv = new(capacityPow2: 16);

    /// <summary>
    /// Returns a snapshot of the receive ring. Completes synchronously when items are
    /// queued, a wake-up is pending, or the connection is closed; otherwise arms the
    /// awaiter and completes when the reactor calls <see cref="Complete"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">A previous read is still armed.</exception>
    public ValueTask<RecvSnapshot> ReadAsync()
    {
        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1)
        {
            Volatile.Write(ref _pending, 0);
            return new ValueTask<RecvSnapshot>(
                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
        }

        if (Volatile.Read(ref _closed) != 0)
        {
            return new ValueTask<RecvSnapshot>(RecvSnapshot.Closed());
        }

        if (Interlocked.Exchange(ref _armed, 1) == 1)
        {
            throw new InvalidOperationException("ReadAsync already armed.");
        }

        // Snapshot the generation as the IVTS token so a future Clear() can
        // invalidate this awaiter if the connection gets pool-recycled.
        int gen = Volatile.Read(ref _generation);

        // Race recovery: re-check between arming and returning the IVTS task.
        if (!_recv.IsEmpty() || Volatile.Read(ref _pending) == 1 || Volatile.Read(ref _closed) != 0)
        {
            Volatile.Write(ref _pending, 0);
            Interlocked.Exchange(ref _armed, 0);

            return new ValueTask<RecvSnapshot>(
                new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
        }

        return new ValueTask<RecvSnapshot>(this, (short)gen);
    }

    /// <summary>Dequeues the next item that was queued before <paramref name="snap"/> was taken.</summary>
    public bool TryGetItem(in RecvSnapshot snap, out SpscRecvRing.Item item)
        => _recv.TryDequeueUntil(snap.Tail, out item);

    public void ResetRead() => _readSignal.Reset();

    /// <summary>
    /// Queues one receive completion and wakes the armed reader, or records a pending
    /// wake-up when no reader is armed. If the ring is full the buffer is handed straight
    /// back to the reactor and the connection is marked closed.
    /// </summary>
    public void Complete(int res, ushort bid, bool hasBuffer, byte* ptr)
    {
        if (!_recv.TryEnqueue(new SpscRecvRing.Item
            {
                Ptr = ptr,
                Bid = bid,
                Len = res,
                HasBuffer = hasBuffer,
                Gen = (ushort)Volatile.Read(ref _generation)
            }))
        {
            Console.Error.WriteLine("[conn] recv queue overflow.");
            if (hasBuffer)
            {
                _reactor.ReturnBufferDirect(bid);
            }
            Volatile.Write(ref _closed, 1);
        }

        if (Interlocked.Exchange(ref _armed, 0) == 1)
        {
            _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), Volatile.Read(ref _closed) != 0));
        }
        else
        {
            Volatile.Write(ref _pending, 1);
        }
    }

    internal void DrainRecv()
    {
        // Return any buffer IDs still sitting in the SPSC ring (handler exited
        // before draining them, or a recv arrived after _closed was set).
        while (_recv.TryDequeue(out SpscRecvRing.Item item))
        {
            if (item.HasBuffer)
            {
                _reactor.ReturnBufferDirect(item.Bid);
            }
        }
    }

    // =========================================================================
    // IValueTaskSource plumbing — token (= snapshot of `_generation` at await
    // time) is compared against the current `_generation` to detect stale
    // awaiters from before a Clear()/pool reuse. Stale awaiters get a
    // sentinel result rather than the new tenant's state.
    //
    // For the actual IVTS dispatch we pass `_readSignal.Version` /
    // `_flushSignal.Version` to the underlying core (not `token`) because the
    // core's version is bumped by ResetRead/CompleteFlush mid-life and is
    // unrelated to the cross-life generation guard.
    // =========================================================================

    RecvSnapshot IValueTaskSource<RecvSnapshot>.GetResult(short token)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            return RecvSnapshot.Closed();
        }

        return _readSignal.GetResult(_readSignal.Version);
    }

    ValueTaskSourceStatus IValueTaskSource<RecvSnapshot>.GetStatus(short token)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            return ValueTaskSourceStatus.Succeeded;
        }

        return _readSignal.GetStatus(_readSignal.Version);
    }

    void IValueTaskSource<RecvSnapshot>.OnCompleted(Action<object?> continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            // Stale — run the continuation now so the awaiter unblocks and
            // gets RecvSnapshot.Closed() from GetResult.
            continuation(state);

            return;
        }

        _readSignal.OnCompleted(continuation, state, _readSignal.Version, flags);
    }
}
diff --git a/Spring/Connection/Connection.Write.cs b/Spring/Connection/Connection.Write.cs
new file mode 100644
index 0000000..06025b6
--- /dev/null
+++ b/Spring/Connection/Connection.Write.cs
@@ -0,0 +1,170 @@
using System.Buffers;
using System.Threading.Tasks.Sources;
using Spring.Utils;

// ReSharper disable SuggestVarOrType_BuiltInTypes

namespace Spring;

public sealed unsafe partial class Connection : IValueTaskSource, IBufferWriter<byte>
{
    private readonly int _writeSlabSize;
    internal byte* WriteBuffer;
    internal int WriteHead;
    internal int WriteTail;
    internal int WriteInFlight;

    private readonly UnmanagedMemoryManager _manager;

    private ManualResetValueTaskSourceCore<bool> _flushSignal = new()
    {
        RunContinuationsAsynchronously = false,
    };
    private int _flushArmed;
    private int _flushInProgress;

    // IBufferWrite
#region IBufferWrite

    /// <summary>Returns the unwritten remainder of the write slab.</summary>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or fewer than <paramref name="sizeHint"/> bytes remain.
    /// </exception>
    public Memory<byte> GetMemory(int sizeHint = 0)
    {
        if (Volatile.Read(ref _flushInProgress) != 0)
        {
            throw new InvalidOperationException("Cannot write while flush is in progress.");
        }

        int remaining = _writeSlabSize - WriteTail;
        if (sizeHint > remaining)
        {
            throw new InvalidOperationException("Buffer too small.");
        }

        return _manager.Memory.Slice(WriteTail, remaining);
    }

    /// <summary>Returns the unwritten remainder of the write slab.</summary>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or fewer than <paramref name="sizeHint"/> bytes remain.
    /// </exception>
    public Span<byte> GetSpan(int sizeHint = 0)
    {
        if (Volatile.Read(ref _flushInProgress) != 0)
        {
            throw new InvalidOperationException("Cannot write while flush is in progress.");
        }

        // Compared against the remainder so a huge hint cannot overflow WriteTail + sizeHint.
        int remaining = _writeSlabSize - WriteTail;
        if (sizeHint > remaining)
        {
            throw new InvalidOperationException("Write buffer too small.");
        }

        return new Span<byte>(WriteBuffer + WriteTail, remaining);
    }

    /// <summary>Commits <paramref name="count"/> bytes written into the span or memory handed out above.</summary>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    /// <exception cref="InvalidOperationException">
    /// A flush is in progress, or the new tail would pass the end of the slab.
    /// </exception>
    public void Advance(int count)
    {
        if (Volatile.Read(ref _flushInProgress) != 0)
        {
            throw new InvalidOperationException("Cannot write while flush is in progress.");
        }

        if (count < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(count));
        }

        // Without this check a bad count would make the later send read past the slab.
        if (count > _writeSlabSize - WriteTail)
        {
            throw new InvalidOperationException("Write buffer too small.");
        }

        WriteTail += count;
    }

#endregion

    // Write to the inner buffer
    public void Write(ReadOnlySpan<byte> source)
    {
        if (Volatile.Read(ref _flushInProgress) != 0)
        {
            throw new InvalidOperationException("Cannot write while flush is in progress.");
        }

        int len = source.Length;
        if (len > _writeSlabSize - WriteTail)
        {
            throw new InvalidOperationException("Write buffer too small.");
        }

        source.CopyTo(new Span<byte>(WriteBuffer + WriteTail, len));
        WriteTail += len;
    }

    // Flush inner buffer data to the kernel
    public ValueTask FlushAsync()
    {
        if (Interlocked.Exchange(ref _flushInProgress, 1) == 1)
        {
            throw new InvalidOperationException("FlushAsync already in progress.");
        }

        int target = WriteTail;
        if (target == 0)
        {
            Volatile.Write(ref _flushInProgress, 0);
            return default;
        }

        if (Interlocked.Exchange(ref _flushArmed, 1) == 1)
        {
            // Release the gate taken above before failing; otherwise every later
            // write or flush on this connection would throw.
            Volatile.Write(ref _flushInProgress, 0);
            throw new InvalidOperationException("FlushAsync already armed.");
        }

        _flushSignal.Reset();
        WriteInFlight = target;

        int gen = Volatile.Read(ref _generation);

        // Send through io_uring (like Minima): hand the connection to the reactor —
        // the ring's single issuer — via EnqueueFlush, which becomes an IORING_OP_SEND
        // on the reactor thread; the send CQE drives CompleteFlush.
        _reactor.EnqueueFlush(ClientFd);

        return new ValueTask(this, (short)gen);
    }

    // Signal the FlushAsync was completed, called by the reactor's dispatcher send branch
    internal void CompleteFlush()
    {
        WriteHead = 0;
        WriteTail = 0;
        WriteInFlight = 0;
        Volatile.Write(ref _flushInProgress, 0);
        Interlocked.Exchange(ref _flushArmed, 0);

        _flushSignal.SetResult(true);
    }

    // IValueTaskSource
#region IValueTaskSource

    void IValueTaskSource.GetResult(short token)
    {
        // A stale token (connection recycled since the await began) completes as a no-op.
        if (token != (short)Volatile.Read(ref _generation))
        {
            return;
        }

        _flushSignal.GetResult(_flushSignal.Version);
    }

    ValueTaskSourceStatus IValueTaskSource.GetStatus(short token)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            return ValueTaskSourceStatus.Succeeded;
        }

        return _flushSignal.GetStatus(_flushSignal.Version);
    }

    void IValueTaskSource.OnCompleted(Action<object?> continuation, object? state, short token, ValueTaskSourceOnCompletedFlags flags)
    {
        if (token != (short)Volatile.Read(ref _generation))
        {
            continuation(state);

            return;
        }
        _flushSignal.OnCompleted(continuation, state, _flushSignal.Version, flags);
    }

#endregion
}
\ No newline at end of file
diff --git a/Spring/Connection/Connection.cs b/Spring/Connection/Connection.cs
new file mode 100644
index 0000000..92b3970
--- /dev/null
+++ b/Spring/Connection/Connection.cs
@@ -0,0 +1,99 @@
using System.Runtime.InteropServices;
using Spring.Utils;

namespace Spring;

public sealed unsafe partial class Connection
{
    private readonly Reactor _reactor;

    public int ClientFd { get; private set; }

    // Bumped on Clear(); the low 16 bits are used as the IVTS token so stale
private int _generation;

/// <summary>
/// Creates a connection bound to <paramref name="reactor"/> and allocates its native,
/// 64-byte-aligned write slab.
/// </summary>
/// <param name="reactor">Reactor this connection submits its flushes to.</param>
/// <param name="fd">Client socket file descriptor.</param>
/// <param name="writeSlabSize">Size of the write slab in bytes; must be positive.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="writeSlabSize"/> is zero or negative.</exception>
public Connection(Reactor reactor, int fd, int writeSlabSize = 1024 * 16)
{
    // A negative size cast to nuint becomes an enormous allocation request, and a
    // zero-byte slab cannot hold any write, so reject both before touching native memory.
    if (writeSlabSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(writeSlabSize), writeSlabSize, "Write slab size must be positive.");
    }

    _reactor = reactor;
    ClientFd = fd;
    _writeSlabSize = writeSlabSize;
    WriteBuffer = (byte*)NativeMemory.AlignedAlloc((nuint)writeSlabSize, 64);

    _manager = new UnmanagedMemoryManager(WriteBuffer, writeSlabSize);
}

// =========================================================================
// Pool lifecycle — invoked from Reactor.Dispatch's recv/send error paths.
// Reactor-thread only.
//
// teardown: MarkClosed() → wake awaiters with closed=1
//           DrainRecv()  → return any in-flight buf_ring items
//           close(fd)
//           Clear()      → reset state, bump _generation
//           push to pool, OR Dispose() if pool is full
// =========================================================================

/// <summary>
/// Flags the connection as closed and completes whichever awaiters are currently armed.
/// </summary>
public void MarkClosed()
{
    Volatile.Write(ref _closed, 1);

    if (Interlocked.Exchange(ref _armed, 0) == 1)
    {
        // A read is armed: complete it with a snapshot that reports the close.
        _readSignal.SetResult(new RecvSnapshot(_recv.SnapshotTail(), isClosed: true));
    }
    else
    {
        // No read armed right now: leave the pending flag set instead.
        Volatile.Write(ref _pending, 1);
    }

    if (Interlocked.Exchange(ref _flushArmed, 0) == 1)
    {
        // A flush is armed: release the write latch, then complete the flush signal.
        Volatile.Write(ref _flushInProgress, 0);
        _flushSignal.SetResult(true);
    }
}

internal void Clear()
{
    // Bump generation first — readers of IVTS plumbing observe this via
    // Volatile.Read and stale tokens get RecvSnapshot.Closed() / no-op.
+ Interlocked.Increment(ref _generation); + + Volatile.Write(ref _armed, 0); + Volatile.Write(ref _pending, 0); + Volatile.Write(ref _closed, 0); + Volatile.Write(ref _flushArmed, 0); + Volatile.Write(ref _flushInProgress, 0); + + WriteHead = 0; + WriteTail = 0; + WriteInFlight = 0; + + _readSignal.Reset(); + _flushSignal.Reset(); + + _recv.Reset(); // discard any leftover SPSC items + IncrementalMode = false; // per-conn ring (if any) was torn down before Clear + } + + public void Dispose() + { + FreeNative(); + GC.SuppressFinalize(this); + } + + // Kestrel mode drops connections (no pool) — the finalizer frees native + // memory. By GC time the fd is long closed and no send references the slab. + ~Connection() => FreeNative(); + + private void FreeNative() + { + if (WriteBuffer != null) + { + NativeMemory.AlignedFree(WriteBuffer); + WriteBuffer = null; + } + DisposeIncremental(); + } +} \ No newline at end of file diff --git a/Spring/Connection/ConnectionDualPipe.cs b/Spring/Connection/ConnectionDualPipe.cs new file mode 100644 index 0000000..68adcb4 --- /dev/null +++ b/Spring/Connection/ConnectionDualPipe.cs @@ -0,0 +1,19 @@ +using System.IO.Pipelines; + +namespace Spring; + +public sealed class ConnectionDualPipe : IDuplexPipe +{ + public PipeReader Input { get; } + public PipeWriter Output { get; } + + public ConnectionDualPipe(Connection connection) + { + ArgumentNullException.ThrowIfNull(connection); + // Kestrel mode: read through the BCL Pipe the reactor feeds. Raw mode: the IVTS reader. + Input = connection.InputPipe is { } pipe + ? 
pipe.Reader + : new ConnectionPipeReader(connection); + Output = new ConnectionPipeWriter(connection); + } +} \ No newline at end of file diff --git a/Spring/Connection/ConnectionPipeReader.cs b/Spring/Connection/ConnectionPipeReader.cs new file mode 100644 index 0000000..aa8b4f6 --- /dev/null +++ b/Spring/Connection/ConnectionPipeReader.cs @@ -0,0 +1,181 @@ +using System.Buffers; +using System.IO.Pipelines; +using Spring.Utils; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Spring; + +/// +/// Adapts Spring's raw read API (ReadAsync + TryGetItem +/// + ReturnBuffer) to a standard . Recv buffers are +/// exposed zero-copy as a ReadOnlySequence<byte> (one segment per buffer) +/// and held until AdvanceTo consumes them, at which point fully-consumed buffers +/// are returned to the reactor. +/// +/// Convenience/compat layer for PipeReader consumers — the raw ReadAsync/ +/// TryGetItem path stays the faster one (this adds held-buffer + sequence +/// bookkeeping per read). +/// +public sealed class ConnectionPipeReader : PipeReader +{ + private readonly Connection _conn; + private readonly List _held = new(16); + private ReadOnlySequence _lastSequence; + + private bool _completed; + private bool _cancelRequested; + private bool _connectionClosed; + + private readonly struct Held + { + public readonly ReadOnlyMemory Memory; + public readonly SpscRecvRing.Item Item; + + public Held(ReadOnlyMemory memory, SpscRecvRing.Item item) + { + Memory = memory; + Item = item; + } + + public Held WithMemory(ReadOnlyMemory memory) => new(memory, Item); + } + + public ConnectionPipeReader(Connection connection) + { + _conn = connection ?? 
throw new ArgumentNullException(nameof(connection)); + } + + public override async ValueTask ReadAsync(CancellationToken cancellationToken = default) + { + ThrowIfCompleted(); + + if (_cancelRequested) + { + _cancelRequested = false; + return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed); + } + + // Anything still held from a previous read that wasn't fully consumed. + if (_held.Count > 0) + return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed); + + if (_connectionClosed) + return new ReadResult(default, isCanceled: false, isCompleted: true); + + RecvSnapshot snap = await _conn.ReadAsync(); + + while (_conn.TryGetItem(snap, out SpscRecvRing.Item item)) + { + if (item.HasBuffer) + _held.Add(new Held(item.AsMemoryManager().Memory, item)); + } + + _conn.ResetRead(); + + if (snap.IsClosed) + _connectionClosed = true; + + if (_cancelRequested) + { + _cancelRequested = false; + return new ReadResult(BuildSequence(), isCanceled: true, isCompleted: _connectionClosed); + } + + return new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed); + } + + public override bool TryRead(out ReadResult result) + { + ThrowIfCompleted(); + + if (_held.Count > 0) + { + result = new ReadResult(BuildSequence(), isCanceled: false, isCompleted: _connectionClosed); + return true; + } + + if (_connectionClosed) + { + result = new ReadResult(default, isCanceled: false, isCompleted: true); + return true; + } + + result = default; + return false; + } + + public override void AdvanceTo(SequencePosition consumed) => AdvanceTo(consumed, consumed); + + public override void AdvanceTo(SequencePosition consumed, SequencePosition examined) + { + if (_held.Count == 0) + return; + + long consumedBytes = _lastSequence.Slice(0, consumed).Length; + + while (_held.Count > 0 && consumedBytes > 0) + { + Held seg = _held[0]; + int available = seg.Memory.Length; + + if (consumedBytes >= available) + { + // Whole 
buffer consumed — return it to the reactor. + _conn.ReturnBuffer(seg.Item); + _held.RemoveAt(0); + consumedBytes -= available; + } + else + { + // Partial — keep the unconsumed tail of this buffer. + _held[0] = seg.WithMemory(seg.Memory[(int)consumedBytes..]); + consumedBytes = 0; + } + } + } + + public override void CancelPendingRead() => _cancelRequested = true; + + public override void Complete(Exception? exception = null) + { + if (_completed) + return; + + _completed = true; + + for (int i = 0; i < _held.Count; i++) + _conn.ReturnBuffer(_held[i].Item); + + _held.Clear(); + } + + private ReadOnlySequence BuildSequence() + { + if (_held.Count == 0) + { + _lastSequence = default; + return _lastSequence; + } + + if (_held.Count == 1) + { + _lastSequence = new ReadOnlySequence(_held[0].Memory); + return _lastSequence; + } + + var head = new RingSegment(_held[0].Memory, _held[0].Item.Bid); + RingSegment tail = head; + + for (int i = 1; i < _held.Count; i++) + tail = tail.Append(_held[i].Memory, _held[i].Item.Bid); + + _lastSequence = new ReadOnlySequence(head, 0, tail, tail.Memory.Length); + return _lastSequence; + } + + private void ThrowIfCompleted() + { + if (_completed) + throw new InvalidOperationException("Reading is not allowed after the reader was completed."); + } +} diff --git a/Spring/Connection/ConnectionPipeWriter.cs b/Spring/Connection/ConnectionPipeWriter.cs new file mode 100644 index 0000000..a41b444 --- /dev/null +++ b/Spring/Connection/ConnectionPipeWriter.cs @@ -0,0 +1,63 @@ +using System.IO.Pipelines; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Spring; + +/// +/// Adapts Spring's write API (GetMemory/GetSpan/Advance/ +/// FlushAsync) to a standard , so PipeWriter-based code +/// can write responses through the connection's per-connection slab. +/// A thin wrapper — all the work lives in Connection. 
+/// +public sealed class ConnectionPipeWriter : PipeWriter +{ + private readonly Connection _conn; + private bool _completed; + private bool _cancelRequested; + private long _unflushed; + + public ConnectionPipeWriter(Connection connection) + { + _conn = connection ?? throw new ArgumentNullException(nameof(connection)); + } + + public override bool CanGetUnflushedBytes => true; + public override long UnflushedBytes => _unflushed; + + public override Memory GetMemory(int sizeHint = 0) => _conn.GetMemory(sizeHint); + + public override Span GetSpan(int sizeHint = 0) => _conn.GetSpan(sizeHint); + + public override void Advance(int bytes) + { + _unflushed += bytes; + _conn.Advance(bytes); + } + + public override ValueTask FlushAsync(CancellationToken cancellationToken = default) + { + if (_cancelRequested) + { + _cancelRequested = false; + return new ValueTask(new FlushResult(isCanceled: true, isCompleted: _completed)); + } + + _unflushed = 0; + ValueTask inner = _conn.FlushAsync(); + + if (inner.IsCompletedSuccessfully) + return new ValueTask(new FlushResult(isCanceled: false, isCompleted: _completed)); + + return AwaitFlush(inner); + } + + private async ValueTask AwaitFlush(ValueTask inner) + { + await inner; + return new FlushResult(isCanceled: false, isCompleted: _completed); + } + + public override void CancelPendingFlush() => _cancelRequested = true; + + public override void Complete(Exception? 
exception = null) => _completed = true; +} diff --git a/Spring/Connection/RecvSnapshot.cs b/Spring/Connection/RecvSnapshot.cs new file mode 100644 index 0000000..1a22084 --- /dev/null +++ b/Spring/Connection/RecvSnapshot.cs @@ -0,0 +1,15 @@ +namespace Spring; + +public readonly struct RecvSnapshot +{ + public readonly long Tail; + public readonly bool IsClosed; + + public RecvSnapshot(long tail, bool isClosed) + { + Tail = tail; + IsClosed = isClosed; + } + + public static RecvSnapshot Closed() => new(0, isClosed: true); +} \ No newline at end of file diff --git a/Spring/Reactor/Reactor.Incremental.cs b/Spring/Reactor/Reactor.Incremental.cs new file mode 100644 index 0000000..6a8c0a7 --- /dev/null +++ b/Spring/Reactor/Reactor.Incremental.cs @@ -0,0 +1,300 @@ +using System.Runtime.InteropServices; +using Spring.Utils; +using static Spring.Native; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Spring; + +/// +/// Incremental-buffer (IOU_PBUF_RING_INC) path. Each connection gets its own +/// buffer ring: one buffer accumulates that connection's byte stream across many +/// recvs, so buffers are recycled only when the kernel is done appending AND the +/// handler has returned every slice it was handed. Selected per reactor by the +/// `_incremental` flag; the shared-ring path in Reactor.cs is untouched. +/// +public sealed unsafe partial class Reactor +{ + private Stack? _freeGids; + private Mpsc? _returnQInc; + + private void InitIncremental() + { + // Per-connection rings; no shared ring. GID 1 reserved; per-conn GIDs 2..MaxConnections+1. 
// GIDs are ushort and the highest one handed out is MaxConnections + 1, so it must fit in 16 bits.
    if (MaxConnections > ushort.MaxValue - 1)
    {
        throw new InvalidOperationException(
            $"MaxConnections={MaxConnections} exceeds the {ushort.MaxValue - 1} buffer-group ids available.");
    }

    _freeGids = new Stack<ushort>(MaxConnections);
    for (int g = MaxConnections + 1; g >= 2; g--)
    {
        _freeGids.Push((ushort)g);
    }

    _returnQInc = new Mpsc<ulong>(1 << 16);
}

/// <summary>Takes a free buffer-group id.</summary>
/// <exception cref="InvalidOperationException">Every id is already in use.</exception>
private ushort AllocGid()
    => _freeGids!.TryPop(out ushort gid)
        ? gid
        : throw new InvalidOperationException(
            $"[r{Id}] no free buffer-group id (MaxConnections={MaxConnections})");

/// <summary>Gives a buffer-group id back to the free list.</summary>
private void FreeGid(ushort gid) => _freeGids!.Push(gid);

// =========================================================================
// Per-connection ring lifecycle
// =========================================================================

/// <summary>
/// Allocates (or reuses) the connection's ring memory, slab and tracking arrays, registers the
/// ring with the kernel under a fresh buffer-group id, and publishes every buffer to it.
/// </summary>
/// <exception cref="InvalidOperationException">Kernel registration failed, or no group id is free.</exception>
private void SetupConnectionBufRing(Connection conn)
{
    ushort gid = AllocGid();
    int entries = ConnBufRingEntries;

    // Ring control area + slab + tracking arrays are allocated once and
    // reused across pool lives; only the kernel registration is per-life.
    if (conn.BufRing == null)
    {
        conn.BufRing = (byte*)NativeMemory.AlignedAlloc((nuint)entries * 16, 4096);
    }
    NativeMemory.Clear(conn.BufRing, (nuint)entries * 16);

    if (conn.BufSlab == null)
    {
        conn.BufSlab = (byte*)NativeMemory.AlignedAlloc((nuint)entries * (nuint)IncRecvBufferSize, 64);
    }

    conn.CumOffset ??= new int[entries];
    conn.RefCount ??= new int[entries];
    conn.KernelDone ??= new bool[entries];
    Array.Clear(conn.CumOffset, 0, entries);
    Array.Clear(conn.RefCount, 0, entries);
    Array.Clear(conn.KernelDone, 0, entries);

    var reg = new io_uring_buf_reg
    {
        ring_addr = (ulong)conn.BufRing,
        ring_entries = (uint)entries,
        bgid = gid,
        flags = IOU_PBUF_RING_INC,
    };
    int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, &reg, 1);
    if (ret < 0)
    {
        // The id was never attached to this connection, so hand it back before failing.
        FreeGid(gid);
        throw new InvalidOperationException($"register pbuf_ring (inc) failed: ret={ret} gid={gid}");
    }

    conn.Bgid = gid;
    conn.BufRingEntries = entries;
    conn.BufRingMask = (uint)(entries - 1);
    conn.IncrementalMode = true;

    // Fill each 16-byte ring slot: address at +0, length at +8, buffer id at +12.
    for (ushort bid = 0; bid < entries; bid++)
    {
        byte* slot = conn.BufRing + (uint)bid * 16;
        *(ulong*)(slot + 0) = (ulong)(conn.BufSlab + bid * (nuint)IncRecvBufferSize);
        *(uint*)(slot + 8) = IncRecvBufferSize;
        *(ushort*)(slot + 12) = bid;
    }

    // Publish all entries at once by writing the ring tail (offset 14).
    Volatile.Write(ref *(ushort*)(conn.BufRing + 14), (ushort)entries);
}

/// <summary>
/// Unregisters the connection's ring from the kernel and frees its buffer-group id.
/// Does nothing when the connection is not in incremental mode.
/// </summary>
private void TeardownConnectionBufRing(Connection conn)
{
    if (conn.IncrementalMode)
    {
        var reg = new io_uring_buf_reg { bgid = conn.Bgid };
        io_uring_register(Ring.Fd, IORING_UNREGISTER_PBUF_RING, &reg, 1);
        FreeGid(conn.Bgid);
    }
    // BufRing / BufSlab / arrays stay allocated for pool reuse.
}

// Re-add a fully-consumed buffer to its connection's ring (reactor-thread only).
private void ReturnConnectionBuffer(Connection conn, ushort bid)
{
    // Reset the per-buffer bookkeeping before the kernel can hand the buffer out again.
    conn.CumOffset![bid] = 0;
    conn.RefCount![bid] = 0;
    conn.KernelDone![bid] = false;

    ushort tail = Volatile.Read(ref *(ushort*)(conn.BufRing + 14));
    byte* slot = conn.BufRing + (tail & conn.BufRingMask) * 16;
    *(ulong*)(slot + 0) = (ulong)(conn.BufSlab + bid * (nuint)IncRecvBufferSize);
    *(uint*)(slot + 8) = IncRecvBufferSize;
    *(ushort*)(slot + 12) = bid;
    Volatile.Write(ref *(ushort*)(conn.BufRing + 14), (ushort)(tail + 1));
}

// =========================================================================
// Refcounted return path (handler → reactor), carrying (fd, gen, bid)
// =========================================================================

// (fd, gen, bid) packed into one ulong for the incremental return queue:
// fd in the high 32 bits, gen in the next 16, bid in the low 16.
private static ulong PackReturn(int fd, ushort gen, ushort bid)
    => ((ulong)(uint)fd << 32) | ((ulong)gen << 16) | bid;

// Inverse of PackReturn.
private static void UnpackReturn(ulong packed, out int fd, out ushort gen, out ushort bid)
{
    fd = (int)(packed >> 32);
    gen = (ushort)((packed >> 16) & 0xFFFF);
    bid = (ushort)(packed & 0xFFFF);
}

/// <summary>
/// Returns one slice of buffer <paramref name="bid"/> for the connection identified by
/// <paramref name="fd"/> and <paramref name="gen"/>. Applied inline on the reactor thread;
/// otherwise queued and the reactor is woken.
/// </summary>
public void EnqueueReturnQIncremental(int fd, ushort gen, ushort bid)
{
    // Fast path: caller is the reactor thread (handler resumed inline).
    if (Environment.CurrentManagedThreadId == _reactorThreadId)
    {
        ApplyReturnIncremental(fd, gen, bid);
        return;
    }

    ulong packed = PackReturn(fd, gen, bid);
    SpinWait sw = default;
    while (!_returnQInc!.TryEnqueue(packed))
    {
        sw.SpinOnce();
    }
    WakeFdWrite();
}

// Applies every queued return.
private void DrainReturnQIncremental()
{
    while (_returnQInc!.TryDequeue(out ulong packed))
    {
        UnpackReturn(packed, out int fd, out ushort gen, out ushort bid);
        ApplyReturnIncremental(fd, gen, bid);
    }
}

// Drops one reference on (conn, bid); once no references remain and the kernel is
// done appending to the buffer, the buffer goes back on the connection's ring.
private void ApplyReturnIncremental(int fd, ushort gen, ushort bid)
{
    if (!Connections.TryGetValue(fd, out var conn) || !conn.IncrementalMode)
    {
        return; // fd gone / ring already torn down
    }
    if ((ushort)conn.Generation != gen)
    {
        return; // stale return from a previous life (fd reused)
    }

    conn.RefCount![bid]--;
    if (conn.RefCount[bid] <= 0 && conn.KernelDone![bid])
    {
        ReturnConnectionBuffer(conn, bid);
    }
}

// =========================================================================
// Incremental reactor loop
// =========================================================================

// Drain the cross-thread queues, submit and wait for at least one completion, then
// dispatch every ready CQE. Exits on a non-transient io_uring_enter error.
private void LoopIncremental()
{
    while (true)
    {
        DrainReturnQIncremental();
        DrainFlushQ();

        int rc = Ring.SubmitAndWait(1);
        if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY)
        {
            Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}");

            break;
        }

        uint ready = Ring.CqReady();
        for (uint i = 0; i < ready; i++)
        {
            DispatchIncremental(in Ring.CqeAt(i));
        }
        Ring.CqAdvance(ready);
    }
}

private void DispatchIncremental(in IoUringCqe cqe)
{
    // user_data layout: kind tag in the high 32 bits, fd in the low 32.
    ulong kind = cqe.user_data & 0xffffffff_00000000UL;
    int fd = (int)(cqe.user_data & 0xffffffffUL);
    bool more = (cqe.flags & IORING_CQE_F_MORE) != 0;

    if (kind == KindWake)
    {
        // Drain the eventfd counter; re-arm the poll only if the kernel ended the multishot.
        ulong drain;
        read(_wakeFd, &drain, 8);
        if (!more)
        {
            ArmWakePoll();
        }
        return;
    }

    if (kind == KindAccept)
    {
        if (cqe.res >= 0)
        {
            int clientFd = cqe.res;
SetNoDelay(clientFd); + Connection conn = _pool.TryPop(out var pooled) + ? pooled.SetFd(clientFd) + : new Connection(this, clientFd, _config.WriteSlabSize); + Connections[clientFd] = conn; + SetupConnectionBufRing(conn); + SubmitRecvMultishot(clientFd, conn.Bgid); + + OnAccept?.Invoke(conn); // NOTE: incremental + Kestrel not wired for v1 (use Incremental=false) + } + else + { + Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}"); + } + if (!more) + { + SubmitAcceptMultishot(); + } + } + else if (kind == KindRecv) + { + bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0; + bool bufMore = (cqe.flags & IORING_CQE_F_BUF_MORE) != 0; + ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0; + + if (cqe.res <= 0) + { + // Peer EOF / recv error — the whole per-conn ring is freed in Recycle. + if (Connections.Remove(fd, out var dyingConn)) + { + Recycle(dyingConn, fd); + } + + return; + } + + if (!Connections.TryGetValue(fd, out var conn)) + { + return; // straggler for a connection whose ring is already gone + } + + // Data lands at the buffer's running offset; the kernel keeps + // appending to this bid until the buffer is full (F_BUF_MORE clear). 
+ byte* ptr = conn.BufSlab + (nuint)bid * (nuint)IncRecvBufferSize + (nuint)conn.CumOffset![bid]; + conn.CumOffset[bid] += cqe.res; + conn.RefCount![bid]++; + if (!bufMore || !more) + { + conn.KernelDone![bid] = true; + } + + conn.Complete(cqe.res, bid, hasBuffer: true, ptr); + + if (!more) + { + SubmitRecvMultishot(fd, conn.Bgid); + } + } + else if (kind == KindSend) + { + if (!Connections.TryGetValue(fd, out var conn)) + { + return; + } + if (cqe.res <= 0) + { + Connections.Remove(fd); + Recycle(conn, fd); + + return; + } + conn.WriteHead += cqe.res; + if (conn.WriteHead < conn.WriteInFlight) + { + SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead)); + + return; + } + + conn.CompleteFlush(); + } + } +} diff --git a/Spring/Reactor/Reactor.cs b/Spring/Reactor/Reactor.cs new file mode 100644 index 0000000..bc87b31 --- /dev/null +++ b/Spring/Reactor/Reactor.cs @@ -0,0 +1,526 @@ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using Spring.Utils; +using static Spring.Native; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Spring; + +/// +/// One reactor = one thread + one io_uring + one listening socket (SO_REUSEPORT) +/// + one connection map. The reactor thread is the sole writer of the SQ ring, +/// the kernel-shared buf_ring, and the connection map. Handlers may run on any +/// thread (e.g. resumed by a thread-pool timer after `await Task.Delay(1)`); +/// they reach the reactor only through two MPSC queues (`_returnQ`, `_flushQ`) +/// woken by an `eventfd` registered as a multishot poll in the ring. +/// +public sealed unsafe partial class Reactor +{ + public readonly int Id; + public Ring Ring = null!; // created on the reactor's own thread (DEFER_TASKRUN requires same-thread setup+enter) + public readonly Dictionary Connections = new(); + + /// Set by the Kestrel transport: vend each accepted connection instead of running an inline handler. + public Action? 
OnAccept; + + private int _listenFd; + private readonly ServerConfig _config; + private readonly ushort _port; + private readonly uint _ringEntries; + private readonly bool _incremental; + private readonly uint RecvBufferSize; + + // CQE user_data layout: kind tag in the high 32 bits, fd in the low 32. + private const ulong KindAccept = 1UL << 32; + private const ulong KindRecv = 2UL << 32; + private const ulong KindSend = 3UL << 32; + private const ulong KindWake = 4UL << 32; // eventfd-based cross-thread wake + + // Provided-buffer ring (one per reactor, shared by all its connections). + private const ushort BgId = 1; + private readonly uint BufferRingEntries; // power of two + private byte* _bufRing; // io_uring_buf_ring (kernel-shared) + private byte* _bufSlab; // contiguous slab of recv buffers + private uint _bufRingMask; + private ushort _bufRingTail; + + // Cross-thread wake mechanism: handlers running off-reactor enqueue work + // into these MPSC queues and `eventfd_write` _wakeFd; a multishot poll on + // _wakeFd registered with the ring delivers a CQE that wakes the reactor. + // When the caller is already the reactor thread (the common case — handler + // resumed inline from an IVTS SetResult), the Enqueue* methods bypass + // the queue and call the direct op, avoiding 2 syscalls per request. + private int _wakeFd; + private int _reactorThreadId; + private readonly Mpsc _returnQ = new(1 << 14); // 16384 slots + private readonly Mpsc _flushQ = new(1 << 12); // 4096 slots (only used for the rare EAGAIN remainder) + + // Connection pool. Reactor-thread-only — accept and teardown both run on + // this reactor, so a plain Stack is sufficient (no MPMC primitive + // needed). PoolMax caps the slab footprint per reactor: + // PoolMax × WriteSlabSize × ReactorCount = total reserved native memory. + private readonly int PoolMax; + private readonly Stack _pool; + + // Incremental-mode (IOU_PBUF_RING_INC) sizing. 
Each connection gets its own + // ring, so reserved native memory is bounded by: + // PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount. + // Keep entries small — the point of incremental is that one buffer holds + // many reads, so you need few of them per connection. + private readonly int MaxConnections; // GID cap (one bgid per active connection) + private readonly int ConnBufRingEntries; // buffers per connection ring + private readonly uint IncRecvBufferSize; // bytes per buffer (filled incrementally) + + // Transient io_uring_enter errnos (Linux): interrupted, would-block, busy. + private const int EINTR = 4; + private const int EAGAIN = 11; + private const int EBUSY = 16; + + public Reactor(int id, ServerConfig config) + { + Id = id; + _config = config; + _port = config.Port; + _ringEntries = config.RingEntries; + _incremental = config.Incremental; + RecvBufferSize = (uint)config.RecvBufferSize; + BufferRingEntries = (uint)config.BufferRingEntries; + PoolMax = config.PoolMax; + MaxConnections = config.MaxConnections; + ConnBufRingEntries = config.ConnBufRingEntries; + IncRecvBufferSize = (uint)config.IncRecvBufferSize; + _pool = new Stack(config.PoolMax); + } + + // ========================================================================= + // Buffer ring + // ========================================================================= + + private void InitBufferRing() + { + nuint ringBytes = (nuint)BufferRingEntries * 16; + _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096); + NativeMemory.Clear(_bufRing, ringBytes); + + nuint slabBytes = BufferRingEntries * (nuint)RecvBufferSize; + _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64); + + _bufRingMask = BufferRingEntries - 1; + + var reg = new io_uring_buf_reg { + ring_addr = (ulong)_bufRing, + ring_entries = BufferRingEntries, + bgid = BgId, + }; + + int ret = io_uring_register(Ring.Fd, IORING_REGISTER_PBUF_RING, ®, 1); + if (ret < 0) + { + int err = 
Marshal.GetLastPInvokeError(); + + throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}"); + } + + // Populate every slot once. Slot 0 overlaps with the ring's tail field + // at offset 14, but we only write addr/len/bid (offsets 0..13) so tail + // stays at zero until we set it explicitly. + for (ushort bid = 0; bid < BufferRingEntries; bid++) { + byte* slot = _bufRing + (uint)bid * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize); + *(uint*)(slot + 8) = RecvBufferSize; + *(ushort*)(slot + 12) = bid; + } + _bufRingTail = (ushort)BufferRingEntries; + + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + + // Reactor-thread-only: writes the kernel-shared buf_ring tail directly. + // Off-reactor callers must use EnqueueReturnQ instead. + internal void ReturnBufferDirect(ushort bid) + { + byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16; + *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)RecvBufferSize); + *(uint*)(slot + 8) = RecvBufferSize; + *(ushort*)(slot + 12) = bid; + _bufRingTail++; + + Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail); + } + + // ========================================================================= + // Cross-thread entry points (safe to call from any thread) + // ========================================================================= + + public void EnqueueReturnQ(ushort bid) + { + // Fast path: caller is the reactor thread (handler running inline from + // an IVTS SetResult). Go straight to the buf_ring — no queue, no syscall. 
// _reactorThreadId is recorded at the top of Run().
    if (Environment.CurrentManagedThreadId == _reactorThreadId)
    {
        ReturnBufferDirect(bid);
        return;
    }

    SpinWait sw = default;
    while (!_returnQ.TryEnqueue(bid))
    {
        sw.SpinOnce();
    }
    WakeFdWrite();
}

/// <summary>
/// Requests an io_uring send of the connection's pending bytes
/// (<c>[WriteHead, WriteInFlight)</c>). Safe to call from any thread.
/// </summary>
/// <param name="fd">File descriptor of the connection to flush.</param>
internal void EnqueueFlush(int fd)
{
    // Connection.FlushAsync calls this for every flush. The reactor is the ring's
    // sole submitter, so on the reactor thread the send SQE is queued right here;
    // from any other thread the fd goes through _flushQ and the reactor is woken.
    if (Environment.CurrentManagedThreadId == _reactorThreadId)
    {
        if (Connections.TryGetValue(fd, out var conn))
        {
            SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead));
        }
        return;
    }

    SpinWait sw = default;
    while (!_flushQ.TryEnqueue(fd))
    {
        sw.SpinOnce();
    }
    WakeFdWrite();
}

// Wakes the reactor from another thread.
private void WakeFdWrite()
{
    ulong v = 1;
    // 8-byte write to eventfd increments its counter; the kernel marks the
    // fd readable, which fires our registered multishot poll's next CQE.
    write(_wakeFd, &v, 8);
}

// Puts every buffer id queued by off-reactor handlers back on the shared buf_ring.
private void DrainReturnQ()
{
    while (_returnQ.TryDequeue(out ushort bid))
    {
        ReturnBufferDirect(bid);
    }
}

// Turns every queued flush request into a send SQE; fds no longer in the map are skipped.
private void DrainFlushQ()
{
    while (_flushQ.TryDequeue(out int fd))
    {
        if (!Connections.TryGetValue(fd, out var conn))
        {
            continue;
        }
        // Send whatever is still outstanding: from the current WriteHead up to
        // the WriteInFlight target that FlushAsync recorded.
        SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead));
    }
}

// Registers a multishot poll on the wake eventfd so cross-thread wakes arrive as CQEs.
private void ArmWakePoll()
{
    IoUringSqe* sqe = GetSqeOrFlush();
    Unsafe.InitBlockUnaligned(sqe, 0, 64);
    sqe->opcode = IORING_OP_POLL_ADD;
    sqe->fd = _wakeFd;
    sqe->op_flags = POLLIN;            // poll32_events lives at this offset
    sqe->len = IORING_POLL_ADD_MULTI;  // multishot — stays armed across CQEs
    sqe->user_data = KindWake | (uint)_wakeFd;
}

// =========================================================================
// Main loop
// =========================================================================

/// <summary>
/// Runs the reactor on the calling thread: creates the ring, listener, buffer rings and wake
/// eventfd, then loops until io_uring_enter fails with a non-transient error.
/// The listener, eventfd and ring are released on every exit path.
/// </summary>
/// <exception cref="InvalidOperationException">The wake eventfd could not be created.</exception>
public void Run()
{
    _reactorThreadId = Environment.CurrentManagedThreadId;

    // -1 means "not opened yet", so the cleanup below never closes an fd this method does not own.
    _listenFd = -1;
    _wakeFd = -1;

    try
    {
        Ring = Ring.Create(_ringEntries);
        _listenFd = OpenReusePortListener(_port);

        if (_incremental)
        {
            InitIncremental();
        }
        else
        {
            InitBufferRing();
        }

        _wakeFd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
        if (_wakeFd < 0)
        {
            throw new InvalidOperationException("eventfd failed");
        }

        Console.WriteLine($"[r{Id}] listening on 0.0.0.0:{_port} (incremental={_incremental})");
        SubmitAcceptMultishot();
        ArmWakePoll();

        if (_incremental)
        {
            LoopIncremental();
        }
        else
        {
            LoopShared();
        }
    }
    finally
    {
        // Also reached when setup or the loop throws, so nothing opened above leaks.
        if (_listenFd >= 0)
        {
            close(_listenFd);
        }
        if (_wakeFd >= 0)
        {
            close(_wakeFd);
        }
        if (Ring is not null)
        {
            Ring.Dispose();
        }
    }
}

private void LoopShared()
{
    while (true)
    {
        // Drain MPSC queues from off-reactor handlers. Cheap when empty.
+ DrainReturnQ(); + DrainFlushQ(); + + int rc = Ring.SubmitAndWait(1); + if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) + { + Console.Error.WriteLine($"[r{Id}] io_uring_enter failed: {rc}"); + break; + } + + uint ready = Ring.CqReady(); + for (uint i = 0; i < ready; i++) + { + Dispatch(in Ring.CqeAt(i)); + } + Ring.CqAdvance(ready); + } + } + + private void Dispatch(in IoUringCqe cqe) + { + ulong kind = cqe.user_data & 0xffffffff_00000000UL; + int fd = (int)(cqe.user_data & 0xffffffffUL); + bool more = (cqe.flags & IORING_CQE_F_MORE) != 0; + + if (kind == KindWake) + { + // Drain the eventfd counter so the next write re-triggers POLLIN + // (multishot poll is edge-triggered on the user_space side). + ulong drain; + read(_wakeFd, &drain, 8); + // The actual queue drains happen at the top of the next loop + // iteration — nothing else to do here. + if (!more) + { + ArmWakePoll(); + } + return; + } + + if (kind == KindAccept) + { + if (cqe.res >= 0) + { + int clientFd = cqe.res; + SetNoDelay(clientFd); + Connection conn = _pool.TryPop(out var pooled) + ? pooled.SetFd(clientFd) + : new Connection(this, clientFd, _config.WriteSlabSize); + Connections[clientFd] = conn; + conn.InitInputPipe(); // recv lands in a BCL pipe Kestrel reads + SubmitRecvMultishot(clientFd); + OnAccept?.Invoke(conn); // vend to the Kestrel transport + } + else + { + Console.Error.WriteLine($"[r{Id}] accept error: {cqe.res}"); + } + // Multishot accept stays armed; only re-arm if the kernel terminated it. + if (!more) + { + SubmitAcceptMultishot(); + } + } + else if (kind == KindRecv) + { + bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0; + ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0; + + if (cqe.res <= 0) + { + // Peer EOF or recv error — reactor owns teardown. 
+ if (hasBuf) + { + ReturnBufferDirect(bid); + } + if (Connections.Remove(fd, out var dyingConn)) + { + Recycle(dyingConn, fd); + } + return; + } + + if (!Connections.TryGetValue(fd, out var conn)) + { + // Straggler buffer for an already-closed connection. + if (hasBuf) + { + ReturnBufferDirect(bid); + } + return; + } + + // Kestrel: copy recv bytes into the BCL pipe, return the buffer. + if (hasBuf) + { + conn.FeedInput(_bufSlab + (nuint)bid * (nuint)RecvBufferSize, cqe.res); + ReturnBufferDirect(bid); + } + + if (!more) + { + SubmitRecvMultishot(fd); + } + } + else if (kind == KindSend) + { + if (!Connections.TryGetValue(fd, out var conn)) + { + return; + } + if (cqe.res <= 0) + { + // Send error — reactor owns teardown. + Connections.Remove(fd); + Recycle(conn, fd); + return; + } + conn.WriteHead += cqe.res; + if (conn.WriteHead < conn.WriteInFlight) + { + // Partial send: resubmit the remainder. + SubmitSend(fd, conn.WriteBuffer + conn.WriteHead, (uint)(conn.WriteInFlight - conn.WriteHead)); + return; + } + // Full target ack'd — resets buffer state and signals the awaiter. 
+ conn.CompleteFlush(); + } + } + + // ========================================================================= + // SQE producers (reactor-thread-only — Connection.FlushAsync hands off via + // EnqueueFlush, which DrainFlushQ turns into SubmitSend on this thread) + // ========================================================================= + + private IoUringSqe* GetSqeOrFlush() + { + IoUringSqe* sqe = Ring.GetSqe(); + if (sqe != null) + { + return sqe; + } + + Ring.SubmitAndWait(0); + sqe = Ring.GetSqe(); + + if (sqe == null) + { + throw new InvalidOperationException("SQ full after flush"); + } + + return sqe; + } + + private void SubmitAcceptMultishot() + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_ACCEPT; + sqe->ioprio = IORING_ACCEPT_MULTISHOT; + sqe->fd = _listenFd; + sqe->user_data = KindAccept | (uint)_listenFd; + } + + private void SubmitRecvMultishot(int fd) => SubmitRecvMultishot(fd, BgId); + + private void SubmitRecvMultishot(int fd, ushort bgid) + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_RECV; + sqe->flags = IOSQE_BUFFER_SELECT; + sqe->ioprio = IORING_RECV_MULTISHOT; + sqe->fd = fd; + sqe->buf_index = bgid; // buffer-group id (shared BgId, or per-conn in incremental) + sqe->user_data = KindRecv | (uint)fd; + } + + private void SubmitSend(int fd, byte* buf, uint len) + { + IoUringSqe* sqe = GetSqeOrFlush(); + Unsafe.InitBlockUnaligned(sqe, 0, 64); + sqe->opcode = IORING_OP_SEND; + sqe->fd = fd; + sqe->addr = (ulong)buf; + sqe->len = len; + sqe->user_data = KindSend | (uint)fd; + } + + private void Recycle(Connection conn, int fd) + { + // Kestrel transport: the connection is owned by the off-reactor consumer. + // Signal EOF to its input pipe, wake any flush awaiter, close the fd, and + // drop it (GC's finalizer frees native). 
Do NOT Clear/pool — that resets + // the flush IVTS Kestrel may still be using off-reactor, and a recycled + // connection would race the consumer. + if (_incremental) + { + TeardownConnectionBufRing(conn); + } + conn.CompleteInput(); + conn.MarkClosed(); + close(fd); + } + + // Disable Nagle on an accepted connection. Must be set per-accepted-socket, + // not on the listener — TCP_NODELAY doesn't reliably inherit across accept, + // which is why zerg/terraform/rtr all set it on the client fd, not the listener. + private static void SetNoDelay(int fd) + { + int one = 1; + setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int)); + } + + private static int OpenReusePortListener(ushort port) + { + int fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) + { + throw new InvalidOperationException($"socket failed: {fd}"); + } + + int one = 1; + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int)); + setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int)); + + sockaddr_in addr = default; + addr.sin_family = AF_INET; + addr.sin_port = Htons(port); + addr.sin_addr.s_addr = 0; // 0.0.0.0 + + if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0) + { + throw new InvalidOperationException("bind failed"); + } + + if (listen(fd, 128) < 0) + { + throw new InvalidOperationException("listen failed"); + } + + return fd; + } +} diff --git a/Spring/ServerConfig.cs b/Spring/ServerConfig.cs new file mode 100644 index 0000000..907e1b8 --- /dev/null +++ b/Spring/ServerConfig.cs @@ -0,0 +1,35 @@ +namespace Spring; + +/// +/// All server tunables in one place — replaces the consts that used to be +/// scattered across Program.cs and Reactor.cs. Defaults match the previous +/// hardcoded values; override via object initializer in Main, e.g.: +/// new ServerConfig { Port = 9000, ReactorCount = 8, Incremental = true }. +/// +public sealed record ServerConfig +{ + // Server-level. 
+ public ushort Port { get; init; } = 8080; + public int ReactorCount { get; init; } = 12; + + // Handler style: false = raw ReadAsync/TryGetItem loop; true = PipeReader/PipeWriter. + public bool UsePipe { get; init; } = false; + + // io_uring SQ/CQ depth. + public uint RingEntries { get; init; } = 8192; + + // Shared buffer ring (used when Incremental == false). + public int RecvBufferSize { get; init; } = 32 * 1024; + public int BufferRingEntries { get; init; } = 4096; + + // Per-connection write slab + connection pool cap. + public int WriteSlabSize { get; init; } = 16 * 1024; + public int PoolMax { get; init; } = 1024; + + // Incremental mode (IOU_PBUF_RING_INC) — per-connection rings. + // reserved native memory ≈ PoolMax × ConnBufRingEntries × IncRecvBufferSize × ReactorCount. + public bool Incremental { get; init; } = false; + public int MaxConnections { get; init; } = 4096; // GID cap (one bgid per active connection) + public int ConnBufRingEntries { get; init; } = 16; // buffers per connection ring + public int IncRecvBufferSize { get; init; } = 4096; // bytes per buffer (filled incrementally) +} diff --git a/Spring/Spring.csproj b/Spring/Spring.csproj new file mode 100644 index 0000000..e2a2b73 --- /dev/null +++ b/Spring/Spring.csproj @@ -0,0 +1,15 @@ + + + + net10.0 + enable + enable + true + Spring + + + + + + + diff --git a/Spring/SpringEngine.cs b/Spring/SpringEngine.cs new file mode 100644 index 0000000..cddcadf --- /dev/null +++ b/Spring/SpringEngine.cs @@ -0,0 +1,43 @@ +using System.Threading.Channels; + +namespace Spring; + +/// +/// Owns N io_uring reactors (each its own SO_REUSEPORT listener) and funnels +/// accepted connections to the Kestrel transport via a channel. 
+/// +public sealed class SpringEngine +{ + private readonly Reactor[] _reactors; + private readonly Channel _accepted = + Channel.CreateUnbounded(new UnboundedChannelOptions { SingleReader = false, SingleWriter = false }); + + public SpringEngine(ServerConfig config) + { + _reactors = new Reactor[config.ReactorCount]; + for (int i = 0; i < config.ReactorCount; i++) + _reactors[i] = new Reactor(i, config) { OnAccept = OnReactorAccept }; + } + + private void OnReactorAccept(Connection conn) + { + // On the reactor thread, before the connection is used: flush continuations + // must run off-reactor (Kestrel pool), not inline on the reactor. + conn.UseAsyncContinuations(); + _accepted.Writer.TryWrite(conn); + } + + public void Start() + { + for (int i = 0; i < _reactors.Length; i++) + { + int idx = i; + var t = new Thread(() => _reactors[idx].Run()) { IsBackground = true, Name = $"spring-r{idx}" }; + t.Start(); + } + } + + public ValueTask AcceptAsync(CancellationToken ct) => _accepted.Reader.ReadAsync(ct); + + public void Stop() => _accepted.Writer.TryComplete(); +} diff --git a/Spring/SpringKestrel.cs b/Spring/SpringKestrel.cs new file mode 100644 index 0000000..f6bbf38 --- /dev/null +++ b/Spring/SpringKestrel.cs @@ -0,0 +1,144 @@ +using System.IO.Pipelines; +using System.Net; +using System.Threading.Channels; +using Microsoft.AspNetCore.Connections; +using Microsoft.AspNetCore.Connections.Features; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Http.Features; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace Spring; + +internal sealed class SpringConnectionContext : ConnectionContext, + IConnectionIdFeature, IConnectionTransportFeature, IConnectionItemsFeature, + IConnectionLifetimeFeature, IConnectionEndPointFeature +{ + private static long s_id; + + private readonly Connection _conn; + private readonly ConnectionDualPipe _pipe; + private readonly 
CancellationTokenSource _closedCts = new(); + private readonly FeatureCollection _features = new(); + private bool _disposed; + + public SpringConnectionContext(Connection conn, EndPoint? localEndPoint) + { + _conn = conn; + _pipe = new ConnectionDualPipe(conn); // Input = InputPipe.Reader, Output = write-slab writer + + ConnectionId = $"spring-{Interlocked.Increment(ref s_id):x}"; + LocalEndPoint = localEndPoint; + Items = new ConnectionItems(); + ConnectionClosed = _closedCts.Token; + + _features.Set(this); + _features.Set(this); + _features.Set(this); + _features.Set(this); + _features.Set(this); + } + + public override string ConnectionId { get; set; } + public override IFeatureCollection Features => _features; + public override IDictionary Items { get; set; } + public override IDuplexPipe Transport + { + get => _pipe; + set => throw new NotSupportedException("Transport is owned by the Spring transport."); + } + public override CancellationToken ConnectionClosed { get; set; } + public override EndPoint? LocalEndPoint { get; set; } + public override EndPoint? 
RemoteEndPoint { get; set; } + + public override void Abort(ConnectionAbortedException abortReason) + { + try { _closedCts.Cancel(); } catch { } + try { _pipe.Input.Complete(abortReason); } catch { } + try { _pipe.Output.Complete(abortReason); } catch { } + } + + public override ValueTask DisposeAsync() + { + if (_disposed) return ValueTask.CompletedTask; + _disposed = true; + try { _closedCts.Cancel(); } catch { } + try { _pipe.Input.Complete(); } catch { } + try { _pipe.Output.Complete(); } catch { } + _closedCts.Dispose(); + return ValueTask.CompletedTask; + } +} + +internal sealed class SpringConnectionListener : IConnectionListener +{ + private readonly SpringEngine _engine; + + public SpringConnectionListener(SpringEngine engine, EndPoint endpoint) + { + _engine = engine; + EndPoint = endpoint; + } + + public EndPoint EndPoint { get; } + + public async ValueTask AcceptAsync(CancellationToken cancellationToken = default) + { + try + { + Connection conn = await _engine.AcceptAsync(cancellationToken).ConfigureAwait(false); + return new SpringConnectionContext(conn, EndPoint); + } + catch (OperationCanceledException) { return null; } + catch (ChannelClosedException) { return null; } + } + + public ValueTask UnbindAsync(CancellationToken cancellationToken = default) { _engine.Stop(); return ValueTask.CompletedTask; } + public ValueTask DisposeAsync() { _engine.Stop(); return ValueTask.CompletedTask; } +} + +public sealed class SpringTransportOptions +{ + public int ReactorCount { get; set; } = Math.Max(1, Environment.ProcessorCount); +} + +public sealed class SpringTransportFactory : IConnectionListenerFactory +{ + private readonly SpringTransportOptions _options; + private readonly ILogger _logger; + + public SpringTransportFactory(IOptions options, ILoggerFactory loggerFactory) + { + _options = options.Value; + _logger = loggerFactory.CreateLogger(); + } + + public ValueTask BindAsync(EndPoint endpoint, CancellationToken cancellationToken = default) + { + if 
(endpoint is not IPEndPoint ip) + throw new NotSupportedException($"Spring only supports {nameof(IPEndPoint)} (got {endpoint.GetType().Name})."); + + var config = new ServerConfig { Port = (ushort)ip.Port, ReactorCount = _options.ReactorCount, Incremental = false }; + var engine = new SpringEngine(config); + engine.Start(); + _logger.LogInformation("[spring] Bound :{Port} with {ReactorCount} io_uring reactor(s) (SO_REUSEPORT)", ip.Port, _options.ReactorCount); + + IConnectionListener listener = new SpringConnectionListener(engine, ip); + return ValueTask.FromResult(listener); + } +} + +public static class SpringKestrelExtensions +{ + /// Replace Kestrel's socket transport with the io_uring-based Spring transport. + public static IWebHostBuilder UseSpring(this IWebHostBuilder builder, Action? configure = null) + { + builder.ConfigureServices(services => + { + if (configure is not null) services.Configure(configure); + services.AddSingleton(); + }); + return builder; + } +} diff --git a/Spring/Utils/Mpsc.cs b/Spring/Utils/Mpsc.cs new file mode 100644 index 0000000..7d164bc --- /dev/null +++ b/Spring/Utils/Mpsc.cs @@ -0,0 +1,115 @@ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Spring.Utils; + +/// +/// Bounded lock-free multi-producer / single-consumer queue. +/// +/// Dmitry Vyukov's bounded MPMC algorithm, specialised to one consumer. +/// Power-of-two capacity, zero-allocation after construction. Producers claim a +/// slot via CAS on the enqueue position (a failed TryEnqueue on a full queue +/// leaves the position untouched — no burned tickets); the single consumer +/// advances the dequeue position with a plain write. Each slot carries a +/// sequence number that coordinates ownership between producers and consumer. 
+/// +/// One generic queue serves every reactor handoff: Mpsc<ushort> for buffer +/// returns, Mpsc<int> for flush fds, Mpsc<ulong> for packed incremental +/// returns. T is unmanaged so each Cell is a blittable value type with no GC refs. +/// +internal sealed class Mpsc where T : unmanaged +{ + private struct Cell + { + public long Sequence; + public T Value; + } + + private readonly Cell[] _buffer; + private readonly int _mask; + + // PaddedLong is a top-level struct (not nested here) because the CLR forbids + // explicit layout on a type nested inside a generic. + private PaddedLong _enqueuePos; + private PaddedLong _dequeuePos; + + public Mpsc(int capacityPow2) + { + if (capacityPow2 < 2 || (capacityPow2 & (capacityPow2 - 1)) != 0) + throw new ArgumentException("Capacity must be a power of two >= 2.", nameof(capacityPow2)); + + _buffer = new Cell[capacityPow2]; + _mask = capacityPow2 - 1; + + for (int i = 0; i < capacityPow2; i++) + _buffer[i].Sequence = i; + } + + /// Multi-producer safe. Returns false if the queue is full. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryEnqueue(T item) + { + Cell[] buffer = _buffer; + int mask = _mask; + + while (true) + { + long pos = Volatile.Read(ref _enqueuePos.Value); + ref Cell cell = ref buffer[(int)pos & mask]; + + long seq = Volatile.Read(ref cell.Sequence); + long dif = seq - pos; + + if (dif == 0) + { + if (Interlocked.CompareExchange(ref _enqueuePos.Value, pos + 1, pos) == pos) + { + cell.Value = item; + Volatile.Write(ref cell.Sequence, pos + 1); + return true; + } + continue; // lost the race; reload and retry + } + + if (dif < 0) + return false; // slot not yet consumed → full + } + } + + /// Single-consumer only. Returns false if empty. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryDequeue(out T item) + { + Cell[] buffer = _buffer; + int mask = _mask; + + long pos = _dequeuePos.Value; // single consumer: plain read + ref Cell cell = ref buffer[(int)pos & mask]; + + long seq = Volatile.Read(ref cell.Sequence); + long dif = seq - (pos + 1); + + if (dif == 0) + { + item = cell.Value; + _dequeuePos.Value = pos + 1; // single consumer: plain write + Volatile.Write(ref cell.Sequence, pos + mask + 1); // free slot for producers + return true; + } + + item = default; + return false; + } +} + +/// +/// A single long padded to a 64-byte cache line so the producer and consumer +/// positions never share a line (no false sharing). Top-level and non-generic +/// so it can legally use explicit layout. +/// +[StructLayout(LayoutKind.Explicit, Size = 64)] +internal struct PaddedLong +{ + [FieldOffset(0)] public long Value; +} diff --git a/Spring/Utils/RingSegment.cs b/Spring/Utils/RingSegment.cs new file mode 100644 index 0000000..dca7aef --- /dev/null +++ b/Spring/Utils/RingSegment.cs @@ -0,0 +1,31 @@ +using System.Buffers; + +namespace Spring.Utils; + +/// +/// One segment of a multi-buffer ReadOnlySequence<byte> built by the +/// ConnectionPipeReader when a single read spans more than one recv buffer. +/// BufferId is carried for debugging; buffer return is driven off the held +/// item list, not the segments. 
+/// +public sealed class RingSegment : ReadOnlySequenceSegment +{ + public ushort BufferId { get; } + + public RingSegment(ReadOnlyMemory memory, ushort bufferId) + { + Memory = memory; + BufferId = bufferId; + } + + public RingSegment Append(ReadOnlyMemory memory, ushort bufferId) + { + var next = new RingSegment(memory, bufferId) + { + RunningIndex = RunningIndex + Memory.Length + }; + + Next = next; + return next; + } +} diff --git a/Spring/Utils/SpscRecvRing.cs b/Spring/Utils/SpscRecvRing.cs new file mode 100644 index 0000000..5f04bf1 --- /dev/null +++ b/Spring/Utils/SpscRecvRing.cs @@ -0,0 +1,105 @@ +using System.Runtime.CompilerServices; + +// ReSharper disable SuggestVarOrType_BuiltInTypes + +namespace Spring.Utils; + +public sealed unsafe class SpscRecvRing +{ + public struct Item + { + public byte* Ptr; + public ushort Bid; + public int Len; + public bool HasBuffer; + public ushort Gen; // connection generation when enqueued (incremental return guard) + + public ReadOnlySpan AsSpan() => new(Ptr, Len); + + public UnmanagedMemoryManager AsMemoryManager() => new(Ptr, Len, Bid); + } + + private readonly Item[] _items; + private readonly int _mask; + private long _tail; + private long _head; + + public SpscRecvRing(int capacityPow2) + { + if (capacityPow2 <= 0 || (capacityPow2 & (capacityPow2 - 1)) != 0) + { + throw new ArgumentException("capacity must be a power of two", nameof(capacityPow2)); + } + + _items = new Item[capacityPow2]; + _mask = capacityPow2 - 1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryEnqueue(in Item item) + { + long head = Volatile.Read(ref _head); + long tail = _tail; + + if ((ulong)(tail - head) >= (ulong)_items.Length) + { + return false; + } + + _items[(int)(tail & _mask)] = item; + Volatile.Write(ref _tail, tail + 1); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryDequeue(out Item item) + { + long head = _head; + long tail = Volatile.Read(ref _tail); + + if 
(head >= tail) + { + item = default; + return false; + } + + item = _items[(int)(head & _mask)]; + Volatile.Write(ref _head, head + 1); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long SnapshotTail() => Volatile.Read(ref _tail); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryDequeueUntil(long tailSnapshot, out Item item) + { + long head = _head; + + if (head >= tailSnapshot) + { + item = default; + return false; + } + + item = _items[(int)(head & _mask)]; + Volatile.Write(ref _head, head + 1); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool IsEmpty() => Volatile.Read(ref _head) >= Volatile.Read(ref _tail); + + // Reactor-thread-only, called during connection teardown (Clear) when no + // handler is consuming. Discards any leftover items so the recycled + // connection starts empty. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Reset() + { + _head = 0; + _tail = 0; + } +} diff --git a/Spring/Utils/UnmanagedMemoryManager.cs b/Spring/Utils/UnmanagedMemoryManager.cs new file mode 100644 index 0000000..ce113f6 --- /dev/null +++ b/Spring/Utils/UnmanagedMemoryManager.cs @@ -0,0 +1,32 @@ +using System.Buffers; + +namespace Spring.Utils; + +public sealed unsafe class UnmanagedMemoryManager : MemoryManager +{ + private readonly byte* _ptr; + private readonly int _length; + + public ushort BufferId { get; } + + public UnmanagedMemoryManager(byte* ptr, int length) + { + _ptr = ptr; + _length = length; + } + + public UnmanagedMemoryManager(byte* ptr, int length, ushort bufferId) + { + _ptr = ptr; + _length = length; + BufferId = bufferId; + } + + public override Span GetSpan() => new(_ptr, _length); + + public override MemoryHandle Pin(int elementIndex = 0) => new(_ptr + elementIndex); + + public override void Unpin() { } + + protected override void Dispose(bool disposing) { } +} diff --git a/Spring/io_uring/Native.cs b/Spring/io_uring/Native.cs new 
file mode 100644 index 0000000..d061d0f --- /dev/null +++ b/Spring/io_uring/Native.cs @@ -0,0 +1,170 @@ +using System.Runtime.InteropServices; + +namespace Spring; + +/// +/// All native interop in one file: io_uring syscalls, libc socket calls, +/// the kernel struct layouts they expect, and the constants needed to +/// drive a minimal io_uring loop. +/// +public static unsafe class Native { + private const long SYS_IO_URING_SETUP = 425; + private const long SYS_IO_URING_ENTER = 426; + private const long SYS_IO_URING_REGISTER = 427; + + public const byte IORING_OP_POLL_ADD = 6; + public const byte IORING_OP_ACCEPT = 13; + public const byte IORING_OP_SEND = 26; + public const byte IORING_OP_RECV = 27; + public const uint IORING_ENTER_GETEVENTS = 1u << 0; + public const long IORING_OFF_SQ_RING = 0; + public const long IORING_OFF_SQES = 0x10000000; + + // Multishot / buffer-ring goodies. + public const ushort IORING_ACCEPT_MULTISHOT = 1 << 0; + public const ushort IORING_RECV_MULTISHOT = 1 << 1; + public const byte IOSQE_BUFFER_SELECT = 1 << 5; + public const uint IORING_CQE_F_BUFFER = 1u << 0; + public const uint IORING_CQE_F_MORE = 1u << 1; + public const int IORING_CQE_BUFFER_SHIFT = 16; + public const uint IORING_REGISTER_PBUF_RING = 22; + public const uint IORING_UNREGISTER_PBUF_RING = 23; + public const uint IORING_POLL_ADD_MULTI = 1u << 0; + + // Incremental provided-buffer consumption (kernel 6.12+). IOU_PBUF_RING_INC + // is set in io_uring_buf_reg.flags at registration; IORING_CQE_F_BUF_MORE is + // set on recv CQEs while the kernel will keep appending to the same buffer. + public const ushort IOU_PBUF_RING_INC = 2; + public const uint IORING_CQE_F_BUF_MORE = 1u << 4; + + // eventfd flags + poll mask (used for the cross-thread wake mechanism). + public const int EFD_CLOEXEC = 0x80000; + public const int EFD_NONBLOCK = 0x800; + public const uint POLLIN = 0x0001; + + // Setup flags. 
SINGLE_ISSUER tells the kernel only one thread will submit + // to this ring (skips locking on the SQ). DEFER_TASKRUN defers completion + // processing until io_uring_enter(GETEVENTS), which lets the kernel batch + // work and avoids interrupting the reactor with task_work mid-flight. + public const uint IORING_SETUP_SINGLE_ISSUER = 1u << 12; + public const uint IORING_SETUP_DEFER_TASKRUN = 1u << 13; + + public const int PROT_READ = 1; + public const int PROT_WRITE = 2; + public const int MAP_SHARED = 1; + public const int MAP_POPULATE = 0x8000; + + public const int AF_INET = 2; + public const int SOCK_STREAM = 1; + public const int SOL_SOCKET = 1; + public const int SO_REUSEADDR = 2; + public const int SO_REUSEPORT = 15; + public const int IPPROTO_TCP = 6; + public const int TCP_NODELAY = 1; + + [DllImport("libc", EntryPoint = "syscall")] + private static extern long syscall3(long nr, uint a1, IoUringParams* a2); + + [DllImport("libc", EntryPoint = "syscall")] + private static extern long syscall6(long nr, uint a1, uint a2, uint a3, uint a4, void* a5, nuint a6); + + [DllImport("libc", EntryPoint = "syscall", SetLastError = true)] + private static extern long syscall4(long nr, uint a1, uint a2, void* a3, uint a4); + + public static int io_uring_setup(uint entries, IoUringParams* p) => + (int)syscall3(SYS_IO_URING_SETUP, entries, p); + + public static int io_uring_enter(int fd, uint toSubmit, uint minComplete, uint flags) => + (int)syscall6(SYS_IO_URING_ENTER, (uint)fd, toSubmit, minComplete, flags, null, 0); + + public static int io_uring_register(int fd, uint opcode, void* arg, uint nrArgs) => + (int)syscall4(SYS_IO_URING_REGISTER, (uint)fd, opcode, arg, nrArgs); + + [DllImport("libc")] public static extern void* mmap(void* addr, nuint length, int prot, int flags, int fd, long offset); + [DllImport("libc")] public static extern int munmap(void* addr, nuint length); + [DllImport("libc")] public static extern int close(int fd); + [DllImport("libc")] public static 
extern int socket(int domain, int type, int proto); + [DllImport("libc")] public static extern int bind(int fd, sockaddr_in* addr, uint len); + [DllImport("libc")] public static extern int listen(int fd, int backlog); + [DllImport("libc")] public static extern int setsockopt(int fd, int level, int optname, void* optval, uint optlen); + [DllImport("libc")] public static extern int eventfd(uint initval, int flags); + [DllImport("libc")] public static extern long write(int fd, void* buf, nuint count); + [DllImport("libc")] public static extern long read(int fd, void* buf, nuint count); + [DllImport("libc", SetLastError = true)] public static extern long send(int fd, byte* buf, nuint len, int flags); + + // Response send path: the Kestrel pool thread sends the response with plain + // send() — thread-safe, OFF the io_uring ring — so the ring stays single-issuer + // (reactor only) and keeps DEFER_TASKRUN. Same mechanism as Shrike's epoll send. + public const int MSG_NOSIGNAL = 0x4000; + public const int EAGAIN = 11; + public const int EWOULDBLOCK = 11; + + public static ushort Htons(ushort x) => (ushort)((x << 8) | (x >> 8)); + + // Kernel struct layouts (must match include/uapi/linux/io_uring.h) + [StructLayout(LayoutKind.Sequential)] + public struct SqRingOffsets { + public uint head, tail, ring_mask, ring_entries, flags, dropped, array, resv1; + public ulong resv2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct CqRingOffsets { + public uint head, tail, ring_mask, ring_entries, overflow, cqes, flags, resv1; + public ulong resv2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct IoUringParams { + public uint sq_entries, cq_entries, flags, sq_thread_cpu, sq_thread_idle; + public uint features, wq_fd, resv0, resv1, resv2; + public SqRingOffsets sq_off; + public CqRingOffsets cq_off; + } + + [StructLayout(LayoutKind.Explicit, Size = 64)] + public struct IoUringSqe { + [FieldOffset(0)] public byte opcode; + [FieldOffset(1)] public byte flags; + 
[FieldOffset(2)] public ushort ioprio; + [FieldOffset(4)] public int fd; + [FieldOffset(8)] public ulong off; + [FieldOffset(16)] public ulong addr; + [FieldOffset(24)] public uint len; + [FieldOffset(28)] public uint op_flags; + [FieldOffset(32)] public ulong user_data; + [FieldOffset(40)] public ushort buf_index; + [FieldOffset(42)] public ushort personality; + [FieldOffset(44)] public int splice_fd_in; + [FieldOffset(48)] public ulong addr3; + [FieldOffset(56)] public ulong __pad2; + } + + [StructLayout(LayoutKind.Sequential)] + public struct IoUringCqe { + public ulong user_data; + public int res; + public uint flags; + } + + // Argument struct for IORING_REGISTER_PBUF_RING. + [StructLayout(LayoutKind.Sequential)] + public struct io_uring_buf_reg { + public ulong ring_addr; + public uint ring_entries; + public ushort bgid; + public ushort flags; + public ulong resv1, resv2, resv3; + } + + [StructLayout(LayoutKind.Sequential)] + public struct in_addr { public uint s_addr; } + + [StructLayout(LayoutKind.Sequential)] + public unsafe struct sockaddr_in { + public ushort sin_family; + public ushort sin_port; + public in_addr sin_addr; + public fixed byte sin_zero[8]; + } +} diff --git a/Spring/io_uring/Ring.cs b/Spring/io_uring/Ring.cs new file mode 100644 index 0000000..bd991c9 --- /dev/null +++ b/Spring/io_uring/Ring.cs @@ -0,0 +1,183 @@ +using System.Runtime.CompilerServices; +using static Spring.Native; + +// ReSharper disable SuggestVarOrType_BuiltInTypes +// ReSharper disable SuggestVarOrType_Elsewhere +#pragma warning disable CA1806 + +namespace Spring; + +public sealed unsafe class Ring : IDisposable +{ + private int _fd; + + public int Fd => _fd; + + private uint* _sqHead; + private uint* _sqTail; + private uint* _sqArray; + private uint _sqMask; + private uint _sqEntries; + private IoUringSqe* _sqes; + + private uint* _cqHead; + private uint* _cqTail; + private IoUringCqe* _cqes; + private uint _cqMask; + + private uint _sqeTail; + + private byte* 
_ringPtr; + private nuint _ringSize; + private byte* _sqePtr; + private nuint _sqeSize; + + public static Ring Create(uint entries) + { + IoUringParams ioUringParams = default; + // Single-issuer: only the reactor submits (accept/recv + the rare EAGAIN + // remainder send via the eventfd handoff). Responses are sent with plain + // send() OFF the ring, so we keep these fast-path flags (DEFER_TASKRUN + // batches completion processing and requires SINGLE_ISSUER). + ioUringParams.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN; + int fd = io_uring_setup(entries, &ioUringParams); + if (fd < 0) + { + throw new InvalidOperationException($"io_uring_setup failed: {fd}"); + } + + var ring = new Ring + { + _fd = fd, + _sqEntries = ioUringParams.sq_entries + }; + + nuint sqRingBytes = ioUringParams.sq_off.array + ioUringParams.sq_entries * sizeof(uint); + nuint cqRingBytes = ioUringParams.cq_off.cqes + ioUringParams.cq_entries * (nuint)sizeof(IoUringCqe); + nuint ringBytes = sqRingBytes > cqRingBytes ? 
sqRingBytes : cqRingBytes; + + void* ringMem = mmap(null, ringBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (ringMem == (void*)-1) + { + close(fd); + + throw new InvalidOperationException("mmap(SQ_RING) failed"); + } + ring._ringPtr = (byte*)ringMem; + ring._ringSize = ringBytes; + + nuint sqeBytes = ioUringParams.sq_entries * (nuint)sizeof(IoUringSqe); + void* sqeMem = mmap(null, sqeBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (sqeMem == (void*)-1) + { + munmap(ringMem, ringBytes); + close(fd); + + throw new InvalidOperationException("mmap(SQES) failed"); + } + ring._sqes = (IoUringSqe*)sqeMem; + ring._sqePtr = (byte*)sqeMem; + ring._sqeSize = sqeBytes; + + byte* ringPointer = (byte*)ringMem; + ring._sqHead = (uint*)(ringPointer + ioUringParams.sq_off.head); + ring._sqTail = (uint*)(ringPointer + ioUringParams.sq_off.tail); + ring._sqArray = (uint*)(ringPointer + ioUringParams.sq_off.array); + ring._sqMask = *(uint*)(ringPointer + ioUringParams.sq_off.ring_mask); + + ring._cqHead = (uint*)(ringPointer + ioUringParams.cq_off.head); + ring._cqTail = (uint*)(ringPointer + ioUringParams.cq_off.tail); + ring._cqes = (IoUringCqe*)(ringPointer + ioUringParams.cq_off.cqes); + ring._cqMask = *(uint*)(ringPointer + ioUringParams.cq_off.ring_mask); + + return ring; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public IoUringSqe* GetSqe() + { + uint head = Volatile.Read(ref *_sqHead); + + if (_sqeTail - head >= _sqEntries) + { + return null; + } + + uint slot = _sqeTail & _sqMask; + _sqArray[slot] = slot; + _sqeTail++; + + return &_sqes[slot]; + } + + public int SubmitAndWait(uint waitFor) + { + uint published = *_sqTail; + uint toSubmit = _sqeTail - published; + + if (toSubmit > 0) + { + Volatile.Write(ref *_sqTail, _sqeTail); + } + + if (toSubmit == 0 && waitFor == 0) return 0; + + uint flags = waitFor > 0 ? 
IORING_ENTER_GETEVENTS : 0; + + return io_uring_enter(_fd, toSubmit, waitFor, flags); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetCqe(out IoUringCqe cqe) + { + uint head = *_cqHead; + uint tail = Volatile.Read(ref *_cqTail); + + if (head == tail) + { + cqe = default; + + return false; + } + + cqe = _cqes[head & _cqMask]; + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqeSeen() => Volatile.Write(ref *_cqHead, *_cqHead + 1); + + // Batched CQ drain (liburing io_uring_for_each_cqe + io_uring_cq_advance): + // read the kernel-written tail once (acquire), process the whole batch, + // then publish the consumed head once (release) instead of once per CQE. + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n); + + public void Dispose() + { + if (_ringPtr != null) + { + munmap(_ringPtr, _ringSize); _ringPtr = null; + } + + if (_sqePtr != null) + { + munmap(_sqePtr, _sqeSize); _sqePtr = null; + } + + if (_fd > 0) + { + close(_fd); _fd = 0; + } + } +} + +#pragma warning restore CA1806 diff --git a/Twinflow.Demo/Program.cs b/Twinflow.Demo/Program.cs new file mode 100644 index 0000000..15c1e1d --- /dev/null +++ b/Twinflow.Demo/Program.cs @@ -0,0 +1,12 @@ +using Twinflow; + +var builder = WebApplication.CreateBuilder(args); +builder.Logging.SetMinimumLevel(LogLevel.Warning); + +builder.WebHost + .UseTwinflow(o => o.ReactorCount = 8) + .ConfigureKestrel(o => o.ListenAnyIP(8080)); + +var app = builder.Build(); +app.MapGet("/", () => "Hello World!"); +app.Run(); diff --git a/Twinflow.Demo/Twinflow.Demo.csproj b/Twinflow.Demo/Twinflow.Demo.csproj new file mode 100644 
namespace Twinflow;

/// <summary>
/// All native interop in one file: io_uring syscalls, libc socket calls,
/// the kernel struct layouts they expect, and the constants needed to
/// drive a minimal io_uring loop. Vendored so Twinflow has no dependencies.
/// </summary>
/// <remarks>
/// The three <c>io_uring_*</c> wrappers return a non-negative value on success and
/// <c>-errno</c> on failure. libc's <c>syscall()</c> itself only ever returns -1 and
/// reports the reason through errno, so the wrappers translate; callers can then compare
/// the result against <c>-EINTR</c>, <c>-EAGAIN</c> or <c>-EBUSY</c> directly.
/// </remarks>
public static unsafe class Native {
    private const long SYS_IO_URING_SETUP = 425;
    private const long SYS_IO_URING_ENTER = 426;
    private const long SYS_IO_URING_REGISTER = 427;

    public const byte IORING_OP_POLL_ADD = 6;
    public const byte IORING_OP_ACCEPT = 13;
    public const byte IORING_OP_SEND = 26;
    public const byte IORING_OP_RECV = 27;
    public const uint IORING_ENTER_GETEVENTS = 1u << 0;
    public const long IORING_OFF_SQ_RING = 0;
    public const long IORING_OFF_SQES = 0x10000000;

    // Multishot / buffer-ring goodies.
    public const ushort IORING_ACCEPT_MULTISHOT = 1 << 0;
    public const ushort IORING_RECV_MULTISHOT = 1 << 1;
    public const byte IOSQE_BUFFER_SELECT = 1 << 5;
    public const uint IORING_CQE_F_BUFFER = 1u << 0;
    public const uint IORING_CQE_F_MORE = 1u << 1;
    public const int IORING_CQE_BUFFER_SHIFT = 16;
    public const uint IORING_REGISTER_PBUF_RING = 22;
    public const uint IORING_UNREGISTER_PBUF_RING = 23;
    public const uint IORING_POLL_ADD_MULTI = 1u << 0;

    // eventfd flags + poll mask, used for the reactor's shutdown wake-up eventfd.
    public const int EFD_CLOEXEC = 0x80000;
    public const int EFD_NONBLOCK = 0x800;
    public const uint POLLIN = 0x0001;

    // Setup flags. SINGLE_ISSUER tells the kernel only one thread will submit
    // to this ring (skips locking on the SQ). DEFER_TASKRUN defers completion
    // processing until io_uring_enter(GETEVENTS), which lets the kernel batch
    // work and avoids interrupting the reactor with task_work mid-flight.
    public const uint IORING_SETUP_SINGLE_ISSUER = 1u << 12;
    public const uint IORING_SETUP_DEFER_TASKRUN = 1u << 13;

    public const int PROT_READ = 1;
    public const int PROT_WRITE = 2;
    public const int MAP_SHARED = 1;
    public const int MAP_POPULATE = 0x8000;

    public const int AF_INET = 2;
    public const int SOCK_STREAM = 1;
    public const int SOL_SOCKET = 1;
    public const int SO_REUSEADDR = 2;
    public const int SO_REUSEPORT = 15;
    public const int IPPROTO_TCP = 6;
    public const int TCP_NODELAY = 1;

    // All three entry points bind to the same variadic libc symbol; SetLastError makes the
    // runtime capture errno right after the call so ToResult can read it reliably.
    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
    private static extern long syscall3(long nr, uint a1, IoUringParams* a2);

    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
    private static extern long syscall6(long nr, uint a1, uint a2, uint a3, uint a4, void* a5, nuint a6);

    [DllImport("libc", EntryPoint = "syscall", SetLastError = true)]
    private static extern long syscall4(long nr, uint a1, uint a2, void* a3, uint a4);

    /// <summary>
    /// Maps a raw <c>syscall()</c> return value to the "value or -errno" convention.
    /// Falls back to -1 if errno was somehow not captured, so a failure can never be
    /// reported as success.
    /// </summary>
    private static int ToResult(long rc) {
        if (rc >= 0) {
            return (int)rc;
        }

        int errno = Marshal.GetLastPInvokeError();
        return errno > 0 ? -errno : -1;
    }

    /// <summary>Creates a ring. Returns the ring fd, or <c>-errno</c> on failure.</summary>
    public static int io_uring_setup(uint entries, IoUringParams* p) =>
        ToResult(syscall3(SYS_IO_URING_SETUP, entries, p));

    /// <summary>Submits and/or waits. Returns the number of SQEs consumed, or <c>-errno</c>.</summary>
    public static int io_uring_enter(int fd, uint toSubmit, uint minComplete, uint flags) =>
        ToResult(syscall6(SYS_IO_URING_ENTER, (uint)fd, toSubmit, minComplete, flags, null, 0));

    /// <summary>
    /// Registers resources with a ring. Returns a non-negative value, or <c>-errno</c>;
    /// the errno also stays readable through <c>Marshal.GetLastPInvokeError()</c>.
    /// </summary>
    public static int io_uring_register(int fd, uint opcode, void* arg, uint nrArgs) =>
        ToResult(syscall4(SYS_IO_URING_REGISTER, (uint)fd, opcode, arg, nrArgs));

    [DllImport("libc")] public static extern void* mmap(void* addr, nuint length, int prot, int flags, int fd, long offset);
    [DllImport("libc")] public static extern int munmap(void* addr, nuint length);
    [DllImport("libc")] public static extern int close(int fd);
    [DllImport("libc")] public static extern int socket(int domain, int type, int proto);
    [DllImport("libc")] public static extern int bind(int fd, sockaddr_in* addr, uint len);
    [DllImport("libc")] public static extern int listen(int fd, int backlog);
    [DllImport("libc")] public static extern int setsockopt(int fd, int level, int optname, void* optval, uint optlen);
    [DllImport("libc")] public static extern int eventfd(uint initval, int flags);
    [DllImport("libc")] public static extern long write(int fd, void* buf, nuint count);
    [DllImport("libc")] public static extern long read(int fd, void* buf, nuint count);
    [DllImport("libc")] public static extern int shutdown(int fd, int how);
    [DllImport("libc", SetLastError = true)] public static extern long send(int fd, byte* buf, nuint len, int flags);

    // Response send path: the Kestrel pool thread sends the response with plain
    // send() — thread-safe, OFF the io_uring ring — so the ring stays single-issuer
    // (reactor only) and keeps DEFER_TASKRUN.
    public const int MSG_NOSIGNAL = 0x4000;
    public const int EAGAIN = 11;
    public const int EWOULDBLOCK = 11;
    public const int EINTR = 4;
    public const int EBUSY = 16;
    public const int SHUT_RDWR = 2;

    /// <summary>
    /// Host-to-network (big-endian) conversion of a 16-bit value. The bytes are swapped
    /// only on little-endian hosts; on a big-endian host the value is already in network order.
    /// </summary>
    public static ushort Htons(ushort x) =>
        BitConverter.IsLittleEndian ? (ushort)((x << 8) | (x >> 8)) : x;

    // Kernel struct layouts (must match include/uapi/linux/io_uring.h)
    [StructLayout(LayoutKind.Sequential)]
    public struct SqRingOffsets {
        public uint head, tail, ring_mask, ring_entries, flags, dropped, array, resv1;
        public ulong resv2;
    }

    [StructLayout(LayoutKind.Sequential)]
    public struct CqRingOffsets {
        public uint head, tail, ring_mask, ring_entries, overflow, cqes, flags, resv1;
        public ulong resv2;
    }

    [StructLayout(LayoutKind.Sequential)]
    public struct IoUringParams {
        public uint sq_entries, cq_entries, flags, sq_thread_cpu, sq_thread_idle;
        public uint features, wq_fd, resv0, resv1, resv2;
        public SqRingOffsets sq_off;
        public CqRingOffsets cq_off;
    }

    [StructLayout(LayoutKind.Explicit, Size = 64)]
    public struct IoUringSqe {
        [FieldOffset(0)] public byte opcode;
        [FieldOffset(1)] public byte flags;
        [FieldOffset(2)] public ushort ioprio;
        [FieldOffset(4)] public int fd;
        [FieldOffset(8)] public ulong off;
        [FieldOffset(16)] public ulong addr;
        [FieldOffset(24)] public uint len;
        [FieldOffset(28)] public uint op_flags;
        [FieldOffset(32)] public ulong user_data;
        [FieldOffset(40)] public ushort buf_index;
        [FieldOffset(42)] public ushort personality;
        [FieldOffset(44)] public int splice_fd_in;
        [FieldOffset(48)] public ulong addr3;
        [FieldOffset(56)] public ulong __pad2;
    }

    [StructLayout(LayoutKind.Sequential)]
    public struct IoUringCqe {
        public ulong user_data;
        public int res;
        public uint flags;
    }

    // Argument struct for IORING_REGISTER_PBUF_RING.
    [StructLayout(LayoutKind.Sequential)]
    public struct io_uring_buf_reg {
        public ulong ring_addr;
        public uint ring_entries;
        public ushort bgid;
        public ushort flags;
        public ulong resv1, resv2, resv3;
    }

    [StructLayout(LayoutKind.Sequential)]
    public struct in_addr { public uint s_addr; }

    [StructLayout(LayoutKind.Sequential)]
    public unsafe struct sockaddr_in {
        public ushort sin_family;
        public ushort sin_port;
        public in_addr sin_addr;
        public fixed byte sin_zero[8];
    }
}
/// <summary>
/// Thin wrapper over an io_uring SQ/CQ ring (single-issuer + DEFER_TASKRUN).
/// </summary>
/// <remarks>
/// Nothing in this type is synchronized: the ring is created with
/// <c>IORING_SETUP_SINGLE_ISSUER</c>, so every member is meant to be called from the
/// one thread that owns it.
/// </remarks>
public sealed unsafe class Ring : IDisposable
{
    // -1 means "no descriptor". 0 is a valid fd, so it must not double as the sentinel.
    private int _fd = -1;

    public int Fd => _fd;

    private uint* _sqHead;
    private uint* _sqTail;
    private uint* _sqArray;
    private uint _sqMask;
    private uint _sqEntries;
    private IoUringSqe* _sqes;

    private uint* _cqHead;
    private uint* _cqTail;
    private IoUringCqe* _cqes;
    private uint _cqMask;

    // Local (not yet published) SQ tail; advanced by GetSqe, published by SubmitAndWait.
    private uint _sqeTail;

    private byte* _ringPtr;
    private nuint _ringSize;
    private byte* _sqePtr;
    private nuint _sqeSize;

    /// <summary>Sets up a ring with the requested number of entries and maps its memory.</summary>
    /// <param name="entries">Requested submission-queue size, passed to <c>io_uring_setup</c>.</param>
    /// <returns>A ready-to-use ring; the caller owns it and must dispose it.</returns>
    /// <exception cref="InvalidOperationException">Setup or one of the mappings failed.</exception>
    public static Ring Create(uint entries)
    {
        IoUringParams ioUringParams = default;
        ioUringParams.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
        int fd = io_uring_setup(entries, &ioUringParams);
        if (fd < 0)
        {
            throw new InvalidOperationException($"io_uring_setup failed: {fd}");
        }

        var ring = new Ring
        {
            _fd = fd,
            _sqEntries = ioUringParams.sq_entries
        };

        // SQ and CQ are reached through a single mapping sized to the larger of the two.
        // NOTE(review): this relies on the kernel exposing both rings through one mapping;
        // the corresponding feature bit in ioUringParams.features is not checked here.
        nuint sqRingBytes = ioUringParams.sq_off.array + ioUringParams.sq_entries * sizeof(uint);
        nuint cqRingBytes = ioUringParams.cq_off.cqes + ioUringParams.cq_entries * (nuint)sizeof(IoUringCqe);
        nuint ringBytes = sqRingBytes > cqRingBytes ? sqRingBytes : cqRingBytes;

        void* ringMem = mmap(null, ringBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
        if (ringMem == (void*)-1)
        {
            close(fd);
            ring._fd = -1;
            throw new InvalidOperationException("mmap(SQ_RING) failed");
        }
        ring._ringPtr = (byte*)ringMem;
        ring._ringSize = ringBytes;

        nuint sqeBytes = ioUringParams.sq_entries * (nuint)sizeof(IoUringSqe);
        void* sqeMem = mmap(null, sqeBytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
        if (sqeMem == (void*)-1)
        {
            // Dispose unmaps the ring mapping and closes the fd recorded above.
            ring.Dispose();
            throw new InvalidOperationException("mmap(SQES) failed");
        }
        ring._sqes = (IoUringSqe*)sqeMem;
        ring._sqePtr = (byte*)sqeMem;
        ring._sqeSize = sqeBytes;

        byte* ringPointer = (byte*)ringMem;
        ring._sqHead = (uint*)(ringPointer + ioUringParams.sq_off.head);
        ring._sqTail = (uint*)(ringPointer + ioUringParams.sq_off.tail);
        ring._sqArray = (uint*)(ringPointer + ioUringParams.sq_off.array);
        ring._sqMask = *(uint*)(ringPointer + ioUringParams.sq_off.ring_mask);

        ring._cqHead = (uint*)(ringPointer + ioUringParams.cq_off.head);
        ring._cqTail = (uint*)(ringPointer + ioUringParams.cq_off.tail);
        ring._cqes = (IoUringCqe*)(ringPointer + ioUringParams.cq_off.cqes);
        ring._cqMask = *(uint*)(ringPointer + ioUringParams.cq_off.ring_mask);

        return ring;
    }

    /// <summary>
    /// Reserves the next submission slot. The entry is NOT cleared; the caller must
    /// initialise all 64 bytes. The slot becomes visible to the kernel on the next
    /// <see cref="SubmitAndWait"/>.
    /// </summary>
    /// <returns>The reserved SQE, or <c>null</c> when the queue is full.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public IoUringSqe* GetSqe()
    {
        uint head = Volatile.Read(ref *_sqHead);

        if (_sqeTail - head >= _sqEntries)
        {
            return null;
        }

        uint slot = _sqeTail & _sqMask;
        _sqArray[slot] = slot;
        _sqeTail++;

        return &_sqes[slot];
    }

    /// <summary>
    /// Publishes the SQEs reserved since the last call and enters the kernel.
    /// </summary>
    /// <param name="waitFor">Minimum completions to wait for; 0 submits without waiting.</param>
    /// <returns>
    /// Whatever <c>io_uring_enter</c> returned, or 0 when there was nothing to submit
    /// and nothing to wait for.
    /// </returns>
    public int SubmitAndWait(uint waitFor)
    {
        if (*_sqTail != _sqeTail)
        {
            Volatile.Write(ref *_sqTail, _sqeTail);
        }

        // Count everything the kernel has not consumed yet (tail - head), not just what was
        // published by this call. If a previous enter failed before consuming the queue,
        // those entries are still pending and must be offered again; measuring against the
        // published tail would report 0 and strand them.
        uint toSubmit = _sqeTail - Volatile.Read(ref *_sqHead);

        if (toSubmit == 0 && waitFor == 0)
        {
            return 0;
        }

        uint flags = waitFor > 0 ? IORING_ENTER_GETEVENTS : 0;

        return io_uring_enter(_fd, toSubmit, waitFor, flags);
    }

    // Batched CQ drain: read the kernel-written tail once, process the whole batch via
    // CqeAt(0..n-1), then publish the consumed head once with CqAdvance(n).

    /// <summary>Number of completions currently available to read.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public uint CqReady() => Volatile.Read(ref *_cqTail) - *_cqHead;

    /// <summary>The <paramref name="i"/>-th unread completion, counted from the current head.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public ref readonly IoUringCqe CqeAt(uint i) => ref _cqes[(*_cqHead + i) & _cqMask];

    /// <summary>Marks <paramref name="n"/> completions as consumed.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void CqAdvance(uint n) => Volatile.Write(ref *_cqHead, *_cqHead + n);

    /// <summary>Unmaps both mappings and closes the ring fd. Safe to call more than once.</summary>
    public void Dispose()
    {
        if (_ringPtr != null)
        {
            munmap(_ringPtr, _ringSize);
            _ringPtr = null;
        }

        if (_sqePtr != null)
        {
            munmap(_sqePtr, _sqeSize);
            _sqePtr = null;
        }

        if (_fd >= 0)
        {
            close(_fd);
            _fd = -1;
        }
    }
}

#pragma warning restore CA1806
/// <summary>
/// Lean connection: dual BCL Pipes + a libc-send pump, with recv driven by the
/// io_uring reactor. The reactor copies recv bytes into Input; Kestrel reads Input
/// and writes Output; the pump drains Output and sends via a thread-safe libc send().
/// </summary>
/// <remarks>
/// Lifetime is a two-reference count: one for the reactor (recv) side and one for the
/// output pump. The fd is closed when both have been released.
/// </remarks>
internal sealed class TwinflowConnection
{
    // Linux MSG_DONTWAIT. Accepted sockets are left in blocking mode (the accept SQE sets
    // no flags), so without this a full socket buffer would block a thread-pool thread
    // inside send() and the EAGAIN path below could never run.
    private const int MsgDontWait = 0x40;

    // How many times the pump yields on EAGAIN before it starts sleeping between retries.
    private const int SpinsBeforeSleep = 16;

    public readonly int Fd;
    public readonly long Id;
    private readonly TwinflowReactor _reactor;

    public readonly Pipe Input;
    public readonly Pipe Output;

    // Serializes reactor-side writes to Input against completion of Input.Writer.
    private readonly object _inputGate = new();

    private int _refs = 2; // reactor (recv) side + pump side
    private int _closed;
    private int _kestrelClosed;
    private static long s_id;

    public TwinflowConnection(int fd, TwinflowReactor reactor)
    {
        Fd = fd;
        _reactor = reactor;
        Id = Interlocked.Increment(ref s_id);
        var o = new PipeOptions(pauseWriterThreshold: 0, resumeWriterThreshold: 0, useSynchronizationContext: false);
        Input = new Pipe(o);
        Output = new Pipe(o);
    }

    public bool IsClosed => Volatile.Read(ref _closed) != 0;

    /// <summary>
    /// Reactor thread: copy recv bytes (from the provided buffer) into Input.
    /// Data that arrives after the connection was marked closed is dropped, because the
    /// writer has been (or is about to be) completed and writing to it would throw.
    /// </summary>
    /// <param name="ptr">Start of the received bytes.</param>
    /// <param name="len">Number of bytes at <paramref name="ptr"/>.</param>
    public unsafe void OnRecv(byte* ptr, int len)
    {
        if (len <= 0)
        {
            return;
        }

        lock (_inputGate)
        {
            if (IsClosed)
            {
                return;
            }

            Span<byte> dst = Input.Writer.GetSpan(len);
            new ReadOnlySpan<byte>(ptr, len).CopyTo(dst);

            Input.Writer.Advance(len);
            _ = Input.Writer.FlushAsync(); // schedules Kestrel's read on the thread pool
        }
    }

    /// <summary>
    /// Output pump (thread pool): drain Output and send via libc send(). Runs until the
    /// writer completes, the read is cancelled, or a send fails; it always completes the
    /// reader and releases the pump's reference on exit.
    /// </summary>
    public async Task RunOutputPump()
    {
        PipeReader reader = Output.Reader;
        try
        {
            while (true)
            {
                ReadResult r = await reader.ReadAsync().ConfigureAwait(false);
                if (r.IsCanceled)
                {
                    break;
                }

                ReadOnlySequence<byte> buf = r.Buffer;
                bool fail = false;

                foreach (ReadOnlyMemory<byte> seg in buf)
                {
                    if (!await SendAllAsync(seg).ConfigureAwait(false))
                    {
                        fail = true;
                        break;
                    }
                }

                reader.AdvanceTo(buf.End);
                if (fail || r.IsCompleted)
                {
                    break;
                }
            }
        }
        catch (Exception)
        {
            // The pump is fire-and-forget, so nobody observes its task. Any failure simply
            // ends the pump; the finally block performs the teardown.
        }
        finally
        {
            try
            {
                await reader.CompleteAsync().ConfigureAwait(false);
            }
            catch (Exception)
            {
                // Completing the reader is best-effort; the reference must still be released.
            }
            DecRef();
        }
    }

    /// <summary>
    /// Sends one segment completely. Returns false when the socket failed or the
    /// connection was closed while the send buffer was full.
    /// </summary>
    private async ValueTask<bool> SendAllAsync(ReadOnlyMemory<byte> seg)
    {
        int off = 0;
        int stalls = 0;

        while (off < seg.Length)
        {
            int sent = TrySend(seg.Span.Slice(off), out bool wouldBlock, out bool closed);
            if (closed)
            {
                return false;
            }

            if (sent > 0)
            {
                off += sent;
                stalls = 0;
                continue;
            }

            if (!wouldBlock)
            {
                continue; // EINTR: retry immediately
            }

            if (IsClosed)
            {
                return false;
            }

            // EAGAIN is rare for small responses, so the first retries only yield. A slow
            // reader would otherwise keep a pool thread spinning, so after a few attempts
            // back off to a short sleep instead.
            if (++stalls <= SpinsBeforeSleep)
            {
                await Task.Yield();
            }
            else
            {
                await Task.Delay(1).ConfigureAwait(false);
            }
        }

        return true;
    }

    /// <summary>
    /// One non-blocking send attempt. Returns the number of bytes sent (0 when nothing
    /// was sent); <paramref name="wouldBlock"/> reports a full send buffer and
    /// <paramref name="closed"/> any other error. EINTR returns 0 with both flags false.
    /// </summary>
    private unsafe int TrySend(ReadOnlySpan<byte> data, out bool wouldBlock, out bool closed)
    {
        wouldBlock = false;
        closed = false;
        if (data.IsEmpty)
        {
            return 0;
        }

        long n;
        fixed (byte* p = data)
        {
            n = send(Fd, p, (nuint)data.Length, MSG_NOSIGNAL | MsgDontWait);
        }
        if (n > 0)
        {
            return (int)n;
        }

        int err = (n == 0) ? EAGAIN : Marshal.GetLastPInvokeError();
        if (err is EAGAIN or EWOULDBLOCK)
        {
            wouldBlock = true;
            return 0;
        }
        if (err == EINTR)
        {
            return 0;
        }

        closed = true;

        return 0;
    }

    /// <summary>
    /// Marks the connection closed and completes Input so Kestrel's reader observes
    /// end-of-stream. Only the first call has any effect.
    /// </summary>
    public void MarkClosed()
    {
        if (Interlocked.Exchange(ref _closed, 1) == 1)
        {
            return;
        }

        lock (_inputGate)
        {
            try
            {
                Input.Writer.Complete();
            }
            catch (Exception)
            {
                // Teardown is best-effort: a failure to complete the writer must not
                // prevent the rest of the close sequence from running.
            }
        }
    }

    /// <summary>Releases one reference; the last release unregisters the connection and closes the fd.</summary>
    public void DecRef()
    {
        if (Interlocked.Decrement(ref _refs) != 0)
        {
            return;
        }
        _reactor.Remove(this);

        close(Fd);
    }

    /// <summary>Takes an extra reference unless the count has already reached zero.</summary>
    private bool TryAddRef()
    {
        int current = Volatile.Read(ref _refs);
        while (current > 0)
        {
            int seen = Interlocked.CompareExchange(ref _refs, current + 1, current);
            if (seen == current)
            {
                return true;
            }
            current = seen;
        }

        return false;
    }

    /// <summary>
    /// Kestrel finished/aborted the connection. Complete the pipes (pump exits) and
    /// shutdown() the socket so the reactor's outstanding multishot recv completes and
    /// releases its ref. The fd is closed once both refs drop to 0.
    /// </summary>
    /// <remarks>
    /// Idempotent: Kestrel may both abort and dispose a connection. A temporary reference
    /// is held across shutdown() so the fd cannot be closed (and its number reused by an
    /// unrelated socket) between completing Output and calling shutdown().
    /// NOTE(review): shutdown(SHUT_RDWR) is issued right after Output is completed, so
    /// bytes the pump has not sent yet may be cut off — confirm this is acceptable.
    /// </remarks>
    public void OnKestrelClose()
    {
        if (Interlocked.Exchange(ref _kestrelClosed, 1) != 0)
        {
            return;
        }

        MarkClosed();
        bool pinned = TryAddRef();
        try
        {
            Output.Writer.Complete();
        }
        catch (Exception)
        {
            // Best-effort: the socket still has to be shut down below.
        }

        if (!pinned)
        {
            return; // both sides already released; the fd is gone
        }

        shutdown(Fd, SHUT_RDWR);
        DecRef();
    }
}
/// <summary>
/// N io_uring reactors (SO_REUSEPORT) feeding accepted connections to Kestrel via a channel.
/// </summary>
public sealed class TwinflowEngine
{
    private readonly List<TwinflowReactor> _reactors = [];
    private readonly List<Thread> _reactorThreads = [];
    private readonly Channel<TwinflowConnection> _accepted =
        Channel.CreateUnbounded<TwinflowConnection>(new UnboundedChannelOptions
        {
            SingleReader = false,
            SingleWriter = false
        });

    private readonly ushort _port;
    private readonly int _reactorCount;
    private readonly int _recvBufferSize;
    private readonly int _bufferRingEntries;
    private readonly uint _ringEntries;

    // 0 = running, 1 = Stop() has been entered.
    private int _stopped;

    internal TwinflowEngine(ushort port, int reactorCount,
        uint ringEntries = 8192, int recvBufferSize = 16 * 1024, int bufferRingEntries = 4096)
    {
        _port = port;
        _reactorCount = reactorCount;
        _ringEntries = ringEntries;
        _recvBufferSize = recvBufferSize;
        _bufferRingEntries = bufferRingEntries;
    }

    /// <summary>Creates the reactors and starts one background thread per reactor.</summary>
    internal void Start()
    {
        for (int i = 0; i < _reactorCount; i++)
        {
            Console.WriteLine($"Starting twinflow reactor {i}");
            var reactor = new TwinflowReactor(i, _port, _ringEntries, _recvBufferSize, _bufferRingEntries)
            {
                OnAccept = OnReactorAccept
            };

            _reactors.Add(reactor);

            var t = new Thread(reactor.Run)
            {
                IsBackground = true,
                Name = $"twinflow-r{i}"
            };

            _reactorThreads.Add(t);
            t.Start();
        }
    }

    /// <summary>
    /// Hands an accepted connection to the listener. If the channel no longer accepts
    /// items (the engine was stopped), nobody will ever own the connection, so it is
    /// closed here instead of being leaked.
    /// </summary>
    private void OnReactorAccept(TwinflowConnection conn)
    {
        if (!_accepted.Writer.TryWrite(conn))
        {
            conn.OnKestrelClose();
        }
    }

    /// <summary>Waits for the next accepted connection.</summary>
    /// <param name="ct">Cancels the wait.</param>
    internal ValueTask<TwinflowConnection> AcceptAsync(CancellationToken ct) => _accepted.Reader.ReadAsync(ct);

    /// <summary>
    /// Signals every reactor to stop, waits up to two seconds for each thread, then
    /// completes the accept channel. Only the first call does anything: the listener
    /// stops the engine from both its unbind and its dispose path, and a second pass
    /// would poke reactors that have already released their wake-up descriptor.
    /// </summary>
    internal void Stop()
    {
        if (Interlocked.Exchange(ref _stopped, 1) != 0)
        {
            return;
        }

        foreach (var r in _reactors)
        {
            r.Stop();
        }
        foreach (var t in _reactorThreads)
        {
            t.Join(TimeSpan.FromSeconds(2));
        }
        _accepted.Writer.TryComplete();
    }
}
using System.Net;
using Microsoft.AspNetCore.Connections;
using Microsoft.AspNetCore.Connections.Features;
using Microsoft.AspNetCore.Hosting;
using Microsoft.AspNetCore.Http.Features;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace Twinflow;

/// <summary>Exposes a connection's two pipes as the duplex pipe Kestrel reads from and writes to.</summary>
internal sealed class TwinflowDuplexPipe : IDuplexPipe
{
    public TwinflowDuplexPipe(TwinflowConnection conn)
    {
        Input = conn.Input.Reader;
        Output = conn.Output.Writer;
    }

    public PipeReader Input { get; }
    public PipeWriter Output { get; }
}

/// <summary>
/// Kestrel-facing view of a <see cref="TwinflowConnection"/>. Implements the connection
/// features itself and registers them in its own feature collection.
/// </summary>
/// <remarks>
/// <see cref="RemoteEndPoint"/> is never assigned by this transport, so it stays null
/// unless a caller sets it.
/// </remarks>
internal sealed class TwinflowConnectionContext : ConnectionContext,
    IConnectionIdFeature, IConnectionTransportFeature, IConnectionItemsFeature,
    IConnectionLifetimeFeature, IConnectionEndPointFeature
{
    private readonly TwinflowConnection _conn;
    private readonly TwinflowDuplexPipe _pipe;
    private readonly CancellationTokenSource _closedCts = new();
    private readonly FeatureCollection _features = new();

    // Both are 0/1 flags flipped with Interlocked so abort and dispose can race safely.
    private int _closeSignaled;
    private int _disposed;

    public TwinflowConnectionContext(TwinflowConnection conn, EndPoint? localEndPoint)
    {
        _conn = conn;
        _pipe = new TwinflowDuplexPipe(conn);

        ConnectionId = $"twinflow-{conn.Id:x}";
        LocalEndPoint = localEndPoint;
        Items = new ConnectionItems();
        ConnectionClosed = _closedCts.Token;

        _features.Set<IConnectionIdFeature>(this);
        _features.Set<IConnectionTransportFeature>(this);
        _features.Set<IConnectionItemsFeature>(this);
        _features.Set<IConnectionLifetimeFeature>(this);
        _features.Set<IConnectionEndPointFeature>(this);
    }

    public override string ConnectionId { get; set; }
    public override IFeatureCollection Features => _features;
    public override IDictionary<object, object?> Items { get; set; }
    public override IDuplexPipe Transport
    {
        get => _pipe;
        set => throw new NotSupportedException("Transport is owned by the Twinflow transport.");
    }
    public override CancellationToken ConnectionClosed { get; set; }
    public override EndPoint? LocalEndPoint { get; set; }
    public override EndPoint? RemoteEndPoint { get; set; }

    /// <summary>Signals <see cref="ConnectionClosed"/> and closes the underlying connection.</summary>
    public override void Abort(ConnectionAbortedException abortReason) => SignalClose();

    /// <summary>Closes the connection if it was not aborted already and releases the token source.</summary>
    public override ValueTask DisposeAsync()
    {
        if (Interlocked.Exchange(ref _disposed, 1) != 0)
        {
            return ValueTask.CompletedTask;
        }

        SignalClose();
        _closedCts.Dispose();

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Runs the close sequence exactly once, whether it is reached through abort, dispose,
    /// or both. A second pass would shut the socket down again after its descriptor may
    /// already have been released.
    /// </summary>
    private void SignalClose()
    {
        if (Interlocked.Exchange(ref _closeSignaled, 1) != 0)
        {
            return;
        }

        try
        {
            _closedCts.Cancel();
        }
        catch (ObjectDisposedException)
        {
            // Aborted after dispose: there is nothing left to signal.
        }
        catch (AggregateException)
        {
            // A ConnectionClosed callback threw; the connection must still be closed.
        }

        _conn.OnKestrelClose();
    }
}

/// <summary>Listener that hands Kestrel the connections accepted by the engine's reactors.</summary>
internal sealed class TwinflowConnectionListener : IConnectionListener
{
    private readonly TwinflowEngine _engine;
    private int _stopped;

    public TwinflowConnectionListener(TwinflowEngine engine, EndPoint endpoint)
    {
        _engine = engine;
        EndPoint = endpoint;
    }

    public EndPoint EndPoint { get; }

    /// <summary>
    /// Waits for the next connection. Returns null when the wait is cancelled or the
    /// engine has been stopped, which tells the caller to stop accepting.
    /// </summary>
    public async ValueTask<ConnectionContext?> AcceptAsync(CancellationToken cancellationToken = default)
    {
        try
        {
            TwinflowConnection conn = await _engine.AcceptAsync(cancellationToken).ConfigureAwait(false);
            return new TwinflowConnectionContext(conn, EndPoint);
        }
        catch (OperationCanceledException)
        {
            return null;
        }
        catch (ChannelClosedException)
        {
            return null;
        }
    }

    public ValueTask UnbindAsync(CancellationToken cancellationToken = default)
    {
        StopEngineOnce();

        return ValueTask.CompletedTask;
    }

    public ValueTask DisposeAsync()
    {
        StopEngineOnce();

        return ValueTask.CompletedTask;
    }

    // Unbind and dispose both end up here; the engine must only be stopped once.
    private void StopEngineOnce()
    {
        if (Interlocked.Exchange(ref _stopped, 1) == 0)
        {
            _engine.Stop();
        }
    }
}

/// <summary>Options for the Twinflow transport.</summary>
public sealed class TwinflowTransportOptions
{
    /// <summary>Number of reactors to start per bound endpoint. Defaults to the processor count (at least 1).</summary>
    public int ReactorCount { get; set; } = Math.Max(1, Environment.ProcessorCount);
}

/// <summary>Kestrel transport factory that binds endpoints with a <see cref="TwinflowEngine"/>.</summary>
public sealed class TwinflowTransportFactory : IConnectionListenerFactory
{
    private readonly TwinflowTransportOptions _options;
    private readonly ILogger _logger;

    public TwinflowTransportFactory(IOptions<TwinflowTransportOptions> options, ILoggerFactory loggerFactory)
    {
        _options = options.Value;
        _logger = loggerFactory.CreateLogger<TwinflowTransportFactory>();
    }

    /// <summary>Starts an engine for <paramref name="endpoint"/> and returns a listener over it.</summary>
    /// <exception cref="NotSupportedException">The endpoint is not an <see cref="IPEndPoint"/>.</exception>
    /// <remarks>
    /// NOTE(review): only the port of the endpoint is passed to the engine; the address
    /// is not, so the requested address does not appear to influence what is bound.
    /// </remarks>
    public ValueTask<IConnectionListener> BindAsync(EndPoint endpoint, CancellationToken cancellationToken = default)
    {
        if (endpoint is not IPEndPoint ip)
        {
            throw new NotSupportedException(
                $"Twinflow only supports {nameof(IPEndPoint)} (got {endpoint.GetType().Name}).");
        }

        // Zero or negative would start no reactor at all and yield a listener that never
        // accepts anything, so fall back to a single reactor.
        int reactorCount = Math.Max(1, _options.ReactorCount);

        var engine = new TwinflowEngine((ushort)ip.Port, reactorCount);
        engine.Start();
        _logger.LogInformation("[twinflow] Bound :{Port} with {ReactorCount} io_uring reactor(s)", ip.Port, reactorCount);

        IConnectionListener listener = new TwinflowConnectionListener(engine, ip);

        return ValueTask.FromResult(listener);
    }
}

public static class TwinflowKestrelExtensions
{
    /// <summary>
    /// Replace Kestrel's socket transport with Twinflow: a per-core io_uring reactor for
    /// accept/recv and a thread-safe libc send() for the response. Linux only.
    /// </summary>
    /// <param name="builder">The web host builder to configure.</param>
    /// <param name="configure">Optional callback to adjust <see cref="TwinflowTransportOptions"/>.</param>
    /// <returns>The same <paramref name="builder"/>, for chaining.</returns>
    public static IWebHostBuilder UseTwinflow(this IWebHostBuilder builder, Action<TwinflowTransportOptions>? configure = null)
    {
        builder.ConfigureServices(services =>
        {
            if (configure is not null)
            {
                services.Configure<TwinflowTransportOptions>(configure);
            }
            services.AddSingleton<IConnectionListenerFactory, TwinflowTransportFactory>();
        });
        return builder;
    }
}
/// <summary>
/// Lean io_uring reactor: one thread + one ring + one SO_REUSEPORT listener +
/// a shared provided-buffer ring. Multishot accept + multishot recv; recv bytes
/// are copied into the connection's Input pipe. The ring never submits a send —
/// the response goes out via libc send() in the connection's pump.
/// </summary>
internal sealed unsafe class TwinflowReactor
{
    public readonly int Id;
    private readonly ushort _port;
    private readonly uint _ringEntries;
    private readonly uint _recvBufferSize;
    private readonly uint _bufferRingEntries;

    private Ring? _ring;
    private int _listenFd = -1;
    private int _wakeFd = -1; // eventfd: written by Stop() to wake the reactor for shutdown
    private readonly object _wakeGate = new(); // keeps Stop()'s write from racing the close of _wakeFd
    private readonly ConcurrentDictionary<int, TwinflowConnection> _conns = new();
    private volatile bool _running = true;

    internal Action<TwinflowConnection>? OnAccept;

    // CQE user_data: kind tag in the high 32 bits, fd in the low 32.
    private const ulong KindAccept = 1UL << 32;
    private const ulong KindRecv = 2UL << 32;
    private const ulong KindWake = 3UL << 32; // shutdown-only eventfd wake (never per-request)
    private const ushort BgId = 1;

    // Linux errno reported when a buffer-select recv finds the provided-buffer ring empty.
    private const int ENOBUFS = 105;

    // Largest buffer ring accepted here; bids are 16-bit and the ring tail is a ushort.
    private const int MaxBufferRingEntries = 32768;

    // Shared provided-buffer ring (one per reactor).
    private byte* _bufRing;
    private byte* _bufSlab;
    private uint _bufRingMask;
    private ushort _bufRingTail;

    /// <summary>Creates a reactor; nothing is bound or submitted until <see cref="Run"/>.</summary>
    /// <param name="id">Reactor index, used in log output.</param>
    /// <param name="port">TCP port to listen on.</param>
    /// <param name="ringEntries">Requested io_uring size.</param>
    /// <param name="recvBufferSize">Size in bytes of each provided recv buffer; must be positive.</param>
    /// <param name="bufferRingEntries">Number of provided buffers; a power of two up to 32768.</param>
    /// <exception cref="ArgumentOutOfRangeException">A buffer parameter is out of range.</exception>
    /// <exception cref="InvalidOperationException">The wake-up eventfd could not be created.</exception>
    public TwinflowReactor(int id, ushort port, uint ringEntries, int recvBufferSize, int bufferRingEntries)
    {
        if (recvBufferSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(recvBufferSize), recvBufferSize, "Must be positive.");
        }

        // The ring is indexed with (tail & (entries - 1)), which only works for powers of two.
        bool powerOfTwo = bufferRingEntries > 0 && (bufferRingEntries & (bufferRingEntries - 1)) == 0;
        if (!powerOfTwo || bufferRingEntries > MaxBufferRingEntries)
        {
            throw new ArgumentOutOfRangeException(nameof(bufferRingEntries), bufferRingEntries,
                "Must be a power of two no larger than 32768.");
        }

        Id = id;
        _port = port;
        _ringEntries = ringEntries;
        _recvBufferSize = (uint)recvBufferSize;
        _bufferRingEntries = (uint)bufferRingEntries;

        // Shutdown-only wake: an eventfd the reactor polls via the ring. Stop() writes it
        // to break the reactor out of io_uring_enter. Created here (any thread) so Stop()
        // always sees a valid fd; armed as a multishot poll inside Run() (reactor thread).
        _wakeFd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
        if (_wakeFd < 0) throw new InvalidOperationException("eventfd failed");
    }

    /// <summary>Asks the reactor loop to exit. Safe to call from any thread, and after the loop has ended.</summary>
    public void Stop()
    {
        _running = false;
        WakeFdWrite(); // break the reactor out of SubmitAndWait so it observes _running
    }

    private void WakeFdWrite()
    {
        // Under the gate so the write can never hit a descriptor Run() has already closed
        // (whose number might by then belong to an unrelated file or socket).
        lock (_wakeGate)
        {
            if (_wakeFd < 0) return;
            ulong v = 1;
            write(_wakeFd, &v, 8);
        }
    }

    // Removes the mapping only if it still points at this exact connection, so a newer
    // connection that reuses the same fd number is left alone.
    internal void Remove(TwinflowConnection conn)
        => _conns.TryRemove(new KeyValuePair<int, TwinflowConnection>(conn.Fd, conn));

    /// <summary>
    /// Reactor thread body: sets up the ring, listener and buffer ring, then dispatches
    /// completions until <see cref="Stop"/> is called or io_uring_enter fails. All
    /// resources are released on the way out, including when setup throws.
    /// </summary>
    public void Run()
    {
        try
        {
            Ring ring = Ring.Create(_ringEntries);
            _ring = ring;
            _listenFd = OpenReusePortListener(_port);
            InitBufferRing();

            SubmitAcceptMultishot();
            ArmWakePoll();

            while (_running)
            {
                // NOTE(review): the comparisons below assume failures are reported as
                // -errno — confirm against the io_uring_enter wrapper.
                int rc = ring.SubmitAndWait(1);
                if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY)
                {
                    Console.Error.WriteLine($"[twinflow r{Id}] io_uring_enter failed: {rc}");

                    break;
                }

                uint ready = ring.CqReady();
                for (uint i = 0; i < ready; i++)
                {
                    Dispatch(in ring.CqeAt(i));
                }
                ring.CqAdvance(ready);
            }
        }
        finally
        {
            Shutdown();
        }
    }

    /// <summary>Releases everything Run() acquired; tolerates a partially completed setup.</summary>
    private void Shutdown()
    {
        if (_listenFd >= 0)
        {
            close(_listenFd);
            _listenFd = -1;
        }

        lock (_wakeGate)
        {
            if (_wakeFd >= 0)
            {
                close(_wakeFd);
                _wakeFd = -1;
            }
        }

        _ring?.Dispose();

        // No completion will be dispatched any more, so the recv-side reference of every
        // remaining connection would never be dropped. Release it here; Kestrel sees its
        // input complete and the fd is closed once the pump side lets go as well.
        foreach (KeyValuePair<int, TwinflowConnection> entry in _conns)
        {
            if (_conns.TryRemove(entry))
            {
                entry.Value.MarkClosed();
                entry.Value.DecRef();
            }
        }

        FreeBufferRing();
    }

    /// <summary>Routes one completion to its handler based on the kind tag in user_data.</summary>
    private void Dispatch(in IoUringCqe cqe)
    {
        ulong kind = cqe.user_data & 0xffffffff_00000000UL;
        int fd = (int)(cqe.user_data & 0xffffffffUL);
        bool more = (cqe.flags & IORING_CQE_F_MORE) != 0;

        if (kind == KindWake)
        {
            HandleWake(more);
        }
        else if (kind == KindAccept)
        {
            HandleAccept(in cqe, more);
        }
        else if (kind == KindRecv)
        {
            HandleRecv(in cqe, fd, more);
        }
    }

    private void HandleWake(bool more)
    {
        // Drain the eventfd counter so a future write re-triggers POLLIN; the loop's
        // while(_running) check performs the actual exit.
        ulong drain;
        read(_wakeFd, &drain, 8);
        if (!more) ArmWakePoll();
    }

    private void HandleAccept(in IoUringCqe cqe, bool more)
    {
        if (cqe.res >= 0)
        {
            int clientFd = cqe.res;
            SetNoDelay(clientFd);
            var conn = new TwinflowConnection(clientFd, this);
            _conns[clientFd] = conn;
            SubmitRecvMultishot(clientFd);
            OnAccept?.Invoke(conn);    // vend to the Kestrel transport
            _ = conn.RunOutputPump();  // start the libc-send pump (thread pool)
        }
        else
        {
            Console.Error.WriteLine($"[twinflow r{Id}] accept error: {cqe.res}");
        }

        if (!more)
        {
            SubmitAcceptMultishot();
        }
    }

    private void HandleRecv(in IoUringCqe cqe, int fd, bool more)
    {
        bool hasBuf = (cqe.flags & IORING_CQE_F_BUFFER) != 0;
        ushort bid = hasBuf ? (ushort)(cqe.flags >> IORING_CQE_BUFFER_SHIFT) : (ushort)0;

        if (cqe.res == -ENOBUFS)
        {
            // The buffer ring ran dry. That is back-pressure, not a dead peer: keep the
            // connection and re-arm; buffers are handed back as this batch is processed.
            if (hasBuf)
            {
                ReturnBufferDirect(bid);
            }
            if (!more && _conns.ContainsKey(fd))
            {
                SubmitRecvMultishot(fd);
            }
            return;
        }

        if (cqe.res <= 0)
        {
            // Peer EOF / recv error / cancel (e.g. shutdown from Kestrel close).
            if (hasBuf)
            {
                ReturnBufferDirect(bid);
            }
            if (_conns.TryRemove(fd, out var dying))
            {
                dying.MarkClosed();
                dying.DecRef();
            }
            return;
        }

        if (_conns.TryGetValue(fd, out var conn))
        {
            if (hasBuf)
            {
                conn.OnRecv(_bufSlab + (nuint)bid * (nuint)_recvBufferSize, cqe.res);
                ReturnBufferDirect(bid);
            }

            // Still registered means the recv-side reference is held, so the fd is open
            // and safe to re-arm.
            if (!more) SubmitRecvMultishot(fd);
        }
        else if (hasBuf)
        {
            // Connection already gone: recycle the buffer and do not re-arm — the fd
            // may be closed, or its number may already belong to a different socket.
            ReturnBufferDirect(bid);
        }
    }

    // Provided-buffer ring
    private void InitBufferRing()
    {
        // 16 bytes per entry: address (8), length (4), buffer id (2), and 2 bytes that in
        // entry 0 hold the ring tail (written at offset 14 below).
        nuint ringBytes = (nuint)_bufferRingEntries * 16;
        _bufRing = (byte*)NativeMemory.AlignedAlloc(ringBytes, 4096);
        NativeMemory.Clear(_bufRing, ringBytes);

        nuint slabBytes = _bufferRingEntries * (nuint)_recvBufferSize;
        _bufSlab = (byte*)NativeMemory.AlignedAlloc(slabBytes, 64);

        _bufRingMask = _bufferRingEntries - 1;

        var reg = new io_uring_buf_reg
        {
            ring_addr = (ulong)_bufRing,
            ring_entries = _bufferRingEntries,
            bgid = BgId,
        };

        int ret = io_uring_register(_ring!.Fd, IORING_REGISTER_PBUF_RING, &reg, 1);
        if (ret < 0)
        {
            int err = Marshal.GetLastPInvokeError();
            throw new InvalidOperationException($"register pbuf_ring failed: ret={ret} errno={err}");
        }

        for (uint bid = 0; bid < _bufferRingEntries; bid++)
        {
            byte* slot = _bufRing + bid * 16;
            *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)_recvBufferSize);
            *(uint*)(slot + 8) = _recvBufferSize;
            *(ushort*)(slot + 12) = (ushort)bid;
        }
        _bufRingTail = (ushort)_bufferRingEntries;
        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
    }

    /// <summary>Hands buffer <paramref name="bid"/> back to the kernel by appending it at the ring tail.</summary>
    private void ReturnBufferDirect(ushort bid)
    {
        byte* slot = _bufRing + (_bufRingTail & _bufRingMask) * 16;
        *(ulong*)(slot + 0) = (ulong)(_bufSlab + bid * (nuint)_recvBufferSize);
        *(uint*)(slot + 8) = _recvBufferSize;
        *(ushort*)(slot + 12) = bid;
        _bufRingTail++;
        Volatile.Write(ref *(ushort*)(_bufRing + 14), _bufRingTail);
    }

    private void FreeBufferRing()
    {
        if (_bufRing != null) { NativeMemory.AlignedFree(_bufRing); _bufRing = null; }
        if (_bufSlab != null) { NativeMemory.AlignedFree(_bufSlab); _bufSlab = null; }
    }

    // SQE producers (reactor-thread only)

    /// <summary>Returns a free SQE, flushing the queue once if it is full.</summary>
    /// <exception cref="InvalidOperationException">The queue is still full after the flush.</exception>
    private IoUringSqe* GetSqeOrFlush()
    {
        Ring ring = _ring!;
        IoUringSqe* sqe = ring.GetSqe();
        if (sqe != null)
        {
            return sqe;
        }

        ring.SubmitAndWait(0);
        sqe = ring.GetSqe();

        if (sqe == null)
        {
            throw new InvalidOperationException("SQ full after flush");
        }

        return sqe;
    }

    private void SubmitAcceptMultishot()
    {
        IoUringSqe* sqe = GetSqeOrFlush();
        Unsafe.InitBlockUnaligned(sqe, 0, 64);
        sqe->opcode = IORING_OP_ACCEPT;
        sqe->ioprio = IORING_ACCEPT_MULTISHOT;
        sqe->fd = _listenFd;
        sqe->user_data = KindAccept | (uint)_listenFd;
    }

    private void ArmWakePoll()
    {
        IoUringSqe* sqe = GetSqeOrFlush();
        Unsafe.InitBlockUnaligned(sqe, 0, 64);
        sqe->opcode = IORING_OP_POLL_ADD;
        sqe->fd = _wakeFd;
        sqe->op_flags = POLLIN;              // poll32_events lives at this offset
        sqe->len = IORING_POLL_ADD_MULTI;    // stays armed across writes
        sqe->user_data = KindWake | (uint)_wakeFd;
    }

    private void SubmitRecvMultishot(int fd)
    {
        IoUringSqe* sqe = GetSqeOrFlush();
        Unsafe.InitBlockUnaligned(sqe, 0, 64);
        sqe->opcode = IORING_OP_RECV;
        sqe->flags = IOSQE_BUFFER_SELECT;
        sqe->ioprio = IORING_RECV_MULTISHOT;
        sqe->fd = fd;
        sqe->buf_index = BgId;
        sqe->user_data = KindRecv | (uint)fd;
    }

    private static void SetNoDelay(int fd)
    {
        int one = 1;
        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(int));
    }

    /// <summary>Opens an IPv4 listener on 0.0.0.0:<paramref name="port"/> with SO_REUSEADDR and SO_REUSEPORT.</summary>
    /// <exception cref="InvalidOperationException">socket, bind or listen failed.</exception>
    private static int OpenReusePortListener(ushort port)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0) throw new InvalidOperationException($"socket failed: {fd}");

        int one = 1;
        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int));
        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(int));

        sockaddr_in addr = default;
        addr.sin_family = AF_INET;
        addr.sin_port = Htons(port);
        addr.sin_addr.s_addr = 0; // 0.0.0.0

        // Close the socket before reporting a failure so the descriptor is not leaked.
        if (bind(fd, &addr, (uint)sizeof(sockaddr_in)) < 0)
        {
            close(fd);
            throw new InvalidOperationException("bind failed");
        }
        if (listen(fd, 128) < 0)
        {
            close(fd);
            throw new InvalidOperationException("listen failed");
        }

        return fd;
    }
}
"{437D20B8-0E8C-4AE5-B987-ECE4F6BCDF9F}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shrike", "Shrike\Shrike.csproj", "{B06D2AFF-CBB2-418C-8B98-D6AFCA253837}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shrike.Playground", "Shrike.Playground\Shrike.Playground.csproj", "{7820B8BB-CDA6-4DA1-B416-4B480570783D}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "KestrelShrike", "KestrelShrike\KestrelShrike.csproj", "{2EA9B300-3F32-4D8B-9A7C-6854B79947D8}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "KestrelShrike.Demo", "KestrelShrike.Demo\KestrelShrike.Demo.csproj", "{4F6C0F15-5515-4048-8C1F-D75D480543F1}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Spring", "Spring\Spring.csproj", "{871CDB05-6C39-49DA-969C-635BCF47F35A}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Spring.Demo", "Spring.Demo\Spring.Demo.csproj", "{4A4E7EDD-597D-4D19-B931-2C798D8C1647}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Kite", "Kite\Kite.csproj", "{23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Kite.Demo", "Kite.Demo\Kite.Demo.csproj", "{C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Twinflow", "Twinflow\Twinflow.csproj", "{916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Twinflow.Demo", "Twinflow.Demo\Twinflow.Demo.csproj", "{90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -276,6 +296,126 @@ Global {437D20B8-0E8C-4AE5-B987-ECE4F6BCDF9F}.Release|x64.Build.0 = Release|Any CPU {437D20B8-0E8C-4AE5-B987-ECE4F6BCDF9F}.Release|x86.ActiveCfg = Release|Any CPU {437D20B8-0E8C-4AE5-B987-ECE4F6BCDF9F}.Release|x86.Build.0 = Release|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Debug|Any 
CPU.ActiveCfg = Debug|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Debug|x64.ActiveCfg = Debug|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Debug|x64.Build.0 = Debug|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Debug|x86.ActiveCfg = Debug|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Debug|x86.Build.0 = Debug|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Release|Any CPU.Build.0 = Release|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Release|x64.ActiveCfg = Release|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Release|x64.Build.0 = Release|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Release|x86.ActiveCfg = Release|Any CPU + {B06D2AFF-CBB2-418C-8B98-D6AFCA253837}.Release|x86.Build.0 = Release|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Debug|x64.ActiveCfg = Debug|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Debug|x64.Build.0 = Debug|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Debug|x86.ActiveCfg = Debug|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Debug|x86.Build.0 = Debug|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Release|Any CPU.Build.0 = Release|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Release|x64.ActiveCfg = Release|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Release|x64.Build.0 = Release|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Release|x86.ActiveCfg = Release|Any CPU + {7820B8BB-CDA6-4DA1-B416-4B480570783D}.Release|x86.Build.0 = Release|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Debug|Any CPU.Build.0 = 
Debug|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Debug|x64.ActiveCfg = Debug|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Debug|x64.Build.0 = Debug|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Debug|x86.ActiveCfg = Debug|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Debug|x86.Build.0 = Debug|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Release|Any CPU.Build.0 = Release|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Release|x64.ActiveCfg = Release|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Release|x64.Build.0 = Release|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Release|x86.ActiveCfg = Release|Any CPU + {2EA9B300-3F32-4D8B-9A7C-6854B79947D8}.Release|x86.Build.0 = Release|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Debug|x64.ActiveCfg = Debug|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Debug|x64.Build.0 = Debug|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Debug|x86.ActiveCfg = Debug|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Debug|x86.Build.0 = Debug|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Release|Any CPU.Build.0 = Release|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Release|x64.ActiveCfg = Release|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Release|x64.Build.0 = Release|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Release|x86.ActiveCfg = Release|Any CPU + {4F6C0F15-5515-4048-8C1F-D75D480543F1}.Release|x86.Build.0 = Release|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Debug|x64.ActiveCfg = Debug|Any CPU + 
{871CDB05-6C39-49DA-969C-635BCF47F35A}.Debug|x64.Build.0 = Debug|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Debug|x86.ActiveCfg = Debug|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Debug|x86.Build.0 = Debug|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Release|Any CPU.Build.0 = Release|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Release|x64.ActiveCfg = Release|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Release|x64.Build.0 = Release|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Release|x86.ActiveCfg = Release|Any CPU + {871CDB05-6C39-49DA-969C-635BCF47F35A}.Release|x86.Build.0 = Release|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Debug|x64.ActiveCfg = Debug|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Debug|x64.Build.0 = Debug|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Debug|x86.ActiveCfg = Debug|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Debug|x86.Build.0 = Debug|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Release|Any CPU.Build.0 = Release|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Release|x64.ActiveCfg = Release|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Release|x64.Build.0 = Release|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Release|x86.ActiveCfg = Release|Any CPU + {4A4E7EDD-597D-4D19-B931-2C798D8C1647}.Release|x86.Build.0 = Release|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Debug|x64.ActiveCfg = Debug|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Debug|x64.Build.0 = Debug|Any CPU + 
{23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Debug|x86.ActiveCfg = Debug|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Debug|x86.Build.0 = Debug|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Release|Any CPU.Build.0 = Release|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Release|x64.ActiveCfg = Release|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Release|x64.Build.0 = Release|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Release|x86.ActiveCfg = Release|Any CPU + {23C65D6D-A9D1-4AC3-AC88-70AF9CCFCEF8}.Release|x86.Build.0 = Release|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Debug|x64.ActiveCfg = Debug|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Debug|x64.Build.0 = Debug|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Debug|x86.ActiveCfg = Debug|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Debug|x86.Build.0 = Debug|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Release|Any CPU.Build.0 = Release|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Release|x64.ActiveCfg = Release|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Release|x64.Build.0 = Release|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Release|x86.ActiveCfg = Release|Any CPU + {C3D22CC7-D70A-4B79-AEA8-10BE22CAFD19}.Release|x86.Build.0 = Release|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Debug|Any CPU.Build.0 = Debug|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Debug|x64.ActiveCfg = Debug|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Debug|x64.Build.0 = Debug|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Debug|x86.ActiveCfg = Debug|Any CPU + 
{916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Debug|x86.Build.0 = Debug|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Release|Any CPU.ActiveCfg = Release|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Release|Any CPU.Build.0 = Release|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Release|x64.ActiveCfg = Release|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Release|x64.Build.0 = Release|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Release|x86.ActiveCfg = Release|Any CPU + {916A1CF0-6B4F-49AB-B8A8-17AB29F53E32}.Release|x86.Build.0 = Release|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Debug|x64.ActiveCfg = Debug|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Debug|x64.Build.0 = Debug|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Debug|x86.ActiveCfg = Debug|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Debug|x86.Build.0 = Debug|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|Any CPU.Build.0 = Release|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x64.ActiveCfg = Release|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x64.Build.0 = Release|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x86.ActiveCfg = Release|Any CPU + {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE