From 4de799249ea40f7567107d57397b38cd1d1cabf7 Mon Sep 17 00:00:00 2001 From: Endel Dreyer Date: Thu, 19 Mar 2026 00:57:33 -0300 Subject: [PATCH 1/5] Replace RangeSet with bitmap for received packet number tracking Inspired by lxin/quic kernel pnspace.c: O(1) duplicate detection via bit test instead of O(n) range scan. Fixed 4096-bit sliding window (512 bytes, zero heap allocation) replaces heap-allocated ArrayList. - PacketNumberBitmap: mark/contains O(1), getRanges scans for ACK gen - ReceivedPacketTracker no longer needs allocator for packet tracking - removeBelow shifts bitmap forward for ACK-of-ACK pruning - Window auto-shifts on large PN jumps (3/4 history preserved) --- src/quic/ack_handler.zig | 45 +++-- src/quic/ranges.zig | 371 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 400 insertions(+), 16 deletions(-) diff --git a/src/quic/ack_handler.zig b/src/quic/ack_handler.zig index dccce6a..0024acd 100644 --- a/src/quic/ack_handler.zig +++ b/src/quic/ack_handler.zig @@ -4,6 +4,8 @@ const testing = std.testing; const ranges = @import("ranges.zig"); const RangeSet = ranges.RangeSet; +const Range = ranges.Range; +const PacketNumberBitmap = ranges.PacketNumberBitmap; const rtt_mod = @import("rtt.zig"); const RttStats = rtt_mod.RttStats; const frame_mod = @import("frame.zig"); @@ -292,9 +294,11 @@ pub const SentPacketTracker = struct { }; /// Tracks received packets for generating ACK frames. +/// Uses a fixed-size sliding-window bitmap for O(1) duplicate detection +/// (inspired by lxin/quic kernel pnspace.c). pub const ReceivedPacketTracker = struct { allocator: Allocator, - received: RangeSet, + received: PacketNumberBitmap = .{}, largest_received: ?u64 = null, largest_received_time: i64 = 0, ack_eliciting_since_last_ack: u32 = 0, @@ -313,17 +317,15 @@ pub const ReceivedPacketTracker = struct { pub fn init(allocator: Allocator) ReceivedPacketTracker { return .{ .allocator = allocator, - .received = RangeSet.init(allocator), }; } pub fn deinit(self: *ReceivedPacketTracker) void { - self.received.deinit(); + _ = self; } pub fn onPacketReceived(self: *ReceivedPacketTracker, pn: u64, ack_eliciting: bool, now: i64, ecn: u2) !void { - if (self.received.contains(pn)) return; - try self.received.add(pn); + if (!self.received.mark(pn)) return; // duplicate if (self.largest_received == null or pn > self.largest_received.?) { self.largest_received = pn; @@ -391,6 +393,12 @@ pub const ReceivedPacketTracker = struct { self.received.removeBelow(below); } + /// Backwards-compatible accessor: returns number of distinct ranges in the bitmap. + pub fn rangeCount(self: *const ReceivedPacketTracker) usize { + var buf: [MAX_ACK_RANGES + 1]Range = undefined; + return self.received.getRanges(&buf); + } + /// Check if there are any unacknowledged ack-eliciting packets. /// Used by the packet packer to piggyback ACKs on outgoing data packets. pub fn hasUnackedAckEliciting(self: *const ReceivedPacketTracker) bool { @@ -417,10 +425,14 @@ pub const ReceivedPacketTracker = struct { } const largest = self.largest_received orelse return null; - const recv_ranges = self.received.getRanges(); - if (recv_ranges.len == 0) return null; - const first_range = recv_ranges[0]; + // Scan bitmap to produce ranges in descending order. + // First slot is used for the first_ack_range; rest become additional ACK ranges. 
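+        // E.g. received {0-2, 5-7, 10-12} with largest=12: all_ranges[0]=[10,12]
+        // gives first_ack_range = 12 - 10 = 2, and [5,7], [0,2] follow as the
+        // additional ACK ranges, in the descending order RFC 9000 §19.3 expects.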
+ var all_ranges: [MAX_ACK_RANGES + 1]Range = undefined; + const total = self.received.getRanges(&all_ranges); + if (total == 0) return null; + + const first_range = all_ranges[0]; const first_ack_range = largest - first_range.start; const ack_delay_ns = now - self.largest_received_time; @@ -431,11 +443,11 @@ pub const ReceivedPacketTracker = struct { self.ack_alarm = null; self.ack_eliciting_since_last_ack = 0; - // Populate additional ACK ranges from received range set + // Populate additional ACK ranges from bitmap scan var ack_ranges: [MAX_ACK_RANGES]AckRange = undefined; var ack_range_count: u8 = 0; - if (recv_ranges.len > 1) { - for (recv_ranges[1..]) |r| { + if (total > 1) { + for (all_ranges[1..total]) |r| { if (ack_range_count >= MAX_ACK_RANGES) break; ack_ranges[ack_range_count] = .{ .start = r.start, .end = r.end }; ack_range_count += 1; @@ -889,14 +901,15 @@ test "ReceivedPacketTracker: pruneAckedRanges removes old ranges" { } // Ranges should cover 0..9 - try testing.expectEqual(@as(usize, 1), tracker.received.getRanges().len); - try testing.expectEqual(@as(u64, 0), tracker.received.getRanges()[0].start); + try testing.expectEqual(@as(usize, 1), tracker.rangeCount()); // Prune below 5 — ranges should now start at 5 tracker.pruneAckedRanges(5); - try testing.expectEqual(@as(usize, 1), tracker.received.getRanges().len); - try testing.expectEqual(@as(u64, 5), tracker.received.getRanges()[0].start); - try testing.expectEqual(@as(u64, 9), tracker.received.getRanges()[0].end); + try testing.expectEqual(@as(usize, 1), tracker.rangeCount()); + // Verify 0-4 are gone, 5-9 remain + try testing.expect(!tracker.received.contains(4)); + try testing.expect(tracker.received.contains(5)); + try testing.expect(tracker.received.contains(9)); } // App-limited cwnd suppression (RFC 9002 §7.8) diff --git a/src/quic/ranges.zig b/src/quic/ranges.zig index 1ef651b..023290e 100644 --- a/src/quic/ranges.zig +++ b/src/quic/ranges.zig @@ -166,6 +166,209 @@ pub const RangeSet = struct { } }; +/// Sliding-window bitmap for O(1) packet number tracking. +/// Replaces RangeSet for received packet number duplicate detection. +/// +/// Design (inspired by lxin/quic kernel pnspace.c): +/// - Fixed 4096-bit bitmap (512 bytes, no heap allocation) +/// - O(1) contains() via bit test +/// - O(1) mark() via bit set +/// - getRanges() scans bitmap to produce descending Range array for ACK frames +/// - removeBelow() shifts the window forward +pub const PacketNumberBitmap = struct { + const BITMAP_BITS: usize = 4096; + const MaskInt = std.DynamicBitSet.MaskInt; + const MASK_BITS = @bitSizeOf(MaskInt); + const NUM_MASKS: usize = BITMAP_BITS / MASK_BITS; + + /// The bitmap, stored inline (512 bytes for 4096 bits). + masks: [NUM_MASKS]MaskInt = [_]MaskInt{0} ** NUM_MASKS, + + /// Base packet number: bitmap[0] corresponds to this PN. + base: u64 = 0, + + /// Highest marked packet number (for fast largest() query). + highest: ?u64 = null, + + /// Number of set bits. + count: u32 = 0, + + /// Mark a packet number as received. Returns true if newly added (not duplicate). 
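+ /// Packet numbers below the current window base are treated as duplicates:
+ /// the window has already slid past them and they can no longer be tracked.
+ /// Typical call site (mirrors onPacketReceived above):
+ ///     if (!self.received.mark(pn)) return; // duplicate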
+ pub fn mark(self: *PacketNumberBitmap, pn: u64) bool { + if (self.highest != null and pn < self.base) return false; // too old + + // If pn is beyond current window, shift the window forward + if (pn >= self.base + BITMAP_BITS) { + self.shiftTo(pn); + } + + const offset = pn - self.base; + const idx = @as(usize, @intCast(offset / MASK_BITS)); + const bit: std.math.Log2Int(MaskInt) = @intCast(offset % MASK_BITS); + + if (self.masks[idx] & (@as(MaskInt, 1) << bit) != 0) { + return false; // duplicate + } + + self.masks[idx] |= @as(MaskInt, 1) << bit; + self.count += 1; + + if (self.highest == null or pn > self.highest.?) { + self.highest = pn; + } + return true; + } + + /// O(1) duplicate detection. + pub fn contains(self: *const PacketNumberBitmap, pn: u64) bool { + if (self.highest == null) return false; + if (pn < self.base or pn >= self.base + BITMAP_BITS) return false; + + const offset = pn - self.base; + const idx = @as(usize, @intCast(offset / MASK_BITS)); + const bit: std.math.Log2Int(MaskInt) = @intCast(offset % MASK_BITS); + + return self.masks[idx] & (@as(MaskInt, 1) << bit) != 0; + } + + pub fn largest(self: *const PacketNumberBitmap) ?u64 { + return self.highest; + } + + /// Remove all entries below `val` by shifting the base forward. + pub fn removeBelow(self: *PacketNumberBitmap, val: u64) void { + if (val <= self.base) return; + + const shift_by = val - self.base; + if (shift_by >= BITMAP_BITS) { + // Clear everything + @memset(&self.masks, 0); + self.base = val; + self.count = 0; + // highest stays — it may be above val + return; + } + + const word_shift = @as(usize, @intCast(shift_by / MASK_BITS)); + const bit_shift: std.math.Log2Int(MaskInt) = @intCast(shift_by % MASK_BITS); + + // Count bits being removed (for accurate count tracking) + var removed: u32 = 0; + if (word_shift > 0) { + for (self.masks[0..word_shift]) |w| { + removed += @popCount(w); + } + } + // Count bits in the partial word that are being shifted out + if (bit_shift > 0 and word_shift < NUM_MASKS) { + const partial_mask = (@as(MaskInt, 1) << bit_shift) - 1; + removed += @popCount(self.masks[word_shift] & partial_mask); + } + self.count -|= removed; + + // Shift words + if (word_shift > 0) { + var i: usize = 0; + while (i + word_shift < NUM_MASKS) : (i += 1) { + self.masks[i] = self.masks[i + word_shift]; + } + // Zero the trailing words + @memset(self.masks[NUM_MASKS - word_shift ..], 0); + } + + // Shift bits within words + if (bit_shift > 0) { + const anti_shift: std.math.Log2Int(MaskInt) = @intCast(MASK_BITS - @as(usize, bit_shift)); + var i: usize = 0; + while (i < NUM_MASKS - 1) : (i += 1) { + self.masks[i] = (self.masks[i] >> bit_shift) | (self.masks[i + 1] << anti_shift); + } + self.masks[NUM_MASKS - 1] >>= bit_shift; + } + + self.base = val; + } + + /// Produce up to `max_ranges` ranges in descending order for ACK frame generation. + /// Scans the bitmap from highest to base, finding contiguous runs of 1s. 
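+ /// At most `out.len` ranges are written and the count is returned. Callers
+ /// size `out` as [MAX_ACK_RANGES + 1]Range: slot 0 becomes the first ACK
+ /// range, the rest become additional ranges (see ack_handler.zig).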
+ pub fn getRanges(self: *const PacketNumberBitmap, out: []Range) usize { + const hi = self.highest orelse return 0; + if (hi < self.base) return 0; + + var range_count: usize = 0; + const max_offset = hi - self.base; + + var pos: u64 = max_offset; + while (range_count < out.len) { + // Find next set bit at or below pos (scanning downward) + const end_off = self.findPrevSet(pos) orelse break; + // Find the start of this contiguous run + const start_off = self.findRunStart(end_off); + + out[range_count] = .{ + .start = self.base + start_off, + .end = self.base + end_off, + }; + range_count += 1; + + if (start_off == 0) break; + pos = start_off - 1; + } + + return range_count; + } + + /// Find the highest set bit at or below `offset` (scanning downward). + fn findPrevSet(self: *const PacketNumberBitmap, offset: u64) ?u64 { + var pos: i64 = @intCast(offset); + while (pos >= 0) { + const p: usize = @intCast(pos); + const idx = p / MASK_BITS; + const bit_in_word = p % MASK_BITS; + + // Mask off bits above our position + const mask = self.masks[idx] & ((@as(MaskInt, 2) << @as(std.math.Log2Int(MaskInt), @intCast(bit_in_word))) -% 1); + if (mask != 0) { + const highest_bit = MASK_BITS - 1 - @as(usize, @clz(mask)); + return @intCast(idx * MASK_BITS + highest_bit); + } + // Move to end of previous word + if (idx == 0) break; + pos = @as(i64, @intCast(idx * MASK_BITS)) - 1; + } + return null; + } + + /// Find the lowest offset in a contiguous run of set bits ending at `end_offset`. + fn findRunStart(self: *const PacketNumberBitmap, end_offset: u64) u64 { + if (end_offset == 0) { + return if (self.masks[0] & 1 != 0) 0 else end_offset; + } + + var pos: u64 = end_offset; + while (pos > 0) { + pos -= 1; + const idx = @as(usize, @intCast(pos / MASK_BITS)); + const bit: std.math.Log2Int(MaskInt) = @intCast(pos % MASK_BITS); + if (self.masks[idx] & (@as(MaskInt, 1) << bit) == 0) { + return pos + 1; + } + } + // Bit 0 is also set — run starts at 0 + return 0; + } + + /// Shift the window so that `pn` fits. Preserves as much history as possible. + fn shiftTo(self: *PacketNumberBitmap, pn: u64) void { + // Shift base so pn fits within the window, leaving some room for reordering. + // Place pn at 3/4 of the window to leave room for future packets. 
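+ // With BITMAP_BITS = 4096 this keeps up to 3072 PNs of history behind
+ // pn and ~1K slots of headroom ahead of it (the "3/4 history preserved"
+ // behavior noted in the commit message).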
+ const new_base = if (pn >= BITMAP_BITS * 3 / 4) pn - BITMAP_BITS * 3 / 4 else 0; + if (new_base > self.base) { + self.removeBelow(new_base); + } + } +}; + // Tests test "RangeSet: add single values" { @@ -267,3 +470,171 @@ test "RangeSet: out-of-order with gaps" { try rs.add(3); try testing.expectEqual(@as(usize, 1), rs.len()); } + +// PacketNumberBitmap tests + +test "PacketNumberBitmap: mark and contains" { + var bm = PacketNumberBitmap{}; + + try testing.expect(bm.mark(0)); + try testing.expect(bm.contains(0)); + try testing.expect(!bm.contains(1)); + + // Duplicate returns false + try testing.expect(!bm.mark(0)); + + try testing.expect(bm.mark(5)); + try testing.expect(bm.contains(5)); + try testing.expect(!bm.contains(3)); + try testing.expectEqual(@as(u64, 5), bm.largest().?); +} + +test "PacketNumberBitmap: sequential packets" { + var bm = PacketNumberBitmap{}; + + var i: u64 = 0; + while (i < 100) : (i += 1) { + try testing.expect(bm.mark(i)); + } + try testing.expectEqual(@as(u32, 100), bm.count); + try testing.expectEqual(@as(u64, 99), bm.largest().?); + + // All should be present + i = 0; + while (i < 100) : (i += 1) { + try testing.expect(bm.contains(i)); + } + try testing.expect(!bm.contains(100)); +} + +test "PacketNumberBitmap: getRanges descending" { + var bm = PacketNumberBitmap{}; + + // Create ranges: [0,2], gap, [5,7], gap, [10,12] + _ = bm.mark(0); + _ = bm.mark(1); + _ = bm.mark(2); + _ = bm.mark(5); + _ = bm.mark(6); + _ = bm.mark(7); + _ = bm.mark(10); + _ = bm.mark(11); + _ = bm.mark(12); + + var out: [10]Range = undefined; + const n = bm.getRanges(&out); + try testing.expectEqual(@as(usize, 3), n); + + // Descending order: highest first + try testing.expectEqual(@as(u64, 10), out[0].start); + try testing.expectEqual(@as(u64, 12), out[0].end); + try testing.expectEqual(@as(u64, 5), out[1].start); + try testing.expectEqual(@as(u64, 7), out[1].end); + try testing.expectEqual(@as(u64, 0), out[2].start); + try testing.expectEqual(@as(u64, 2), out[2].end); +} + +test "PacketNumberBitmap: removeBelow" { + var bm = PacketNumberBitmap{}; + + var i: u64 = 0; + while (i < 20) : (i += 1) { + _ = bm.mark(i); + } + + bm.removeBelow(10); + try testing.expectEqual(@as(u64, 10), bm.base); + + // 0-9 should be gone + i = 0; + while (i < 10) : (i += 1) { + try testing.expect(!bm.contains(i)); + } + // 10-19 should still be present + while (i < 20) : (i += 1) { + try testing.expect(bm.contains(i)); + } + try testing.expectEqual(@as(u32, 10), bm.count); +} + +test "PacketNumberBitmap: out-of-order with gaps" { + var bm = PacketNumberBitmap{}; + + _ = bm.mark(0); + _ = bm.mark(2); + _ = bm.mark(4); + + try testing.expect(bm.contains(0)); + try testing.expect(!bm.contains(1)); + try testing.expect(bm.contains(2)); + try testing.expect(!bm.contains(3)); + try testing.expect(bm.contains(4)); + + var out: [10]Range = undefined; + const n = bm.getRanges(&out); + try testing.expectEqual(@as(usize, 3), n); + try testing.expectEqual(@as(u64, 4), out[0].start); + try testing.expectEqual(@as(u64, 4), out[0].end); + try testing.expectEqual(@as(u64, 2), out[1].start); + try testing.expectEqual(@as(u64, 2), out[1].end); + try testing.expectEqual(@as(u64, 0), out[2].start); + try testing.expectEqual(@as(u64, 0), out[2].end); + + // Fill gaps + _ = bm.mark(1); + _ = bm.mark(3); + const n2 = bm.getRanges(&out); + try testing.expectEqual(@as(usize, 1), n2); + try testing.expectEqual(@as(u64, 0), out[0].start); + try testing.expectEqual(@as(u64, 4), out[0].end); +} + +test "PacketNumberBitmap: large 
packet numbers" { + var bm = PacketNumberBitmap{}; + + // Start at high PN + _ = bm.mark(1_000_000); + _ = bm.mark(1_000_001); + _ = bm.mark(1_000_002); + + try testing.expect(bm.contains(1_000_000)); + try testing.expect(bm.contains(1_000_001)); + try testing.expect(bm.contains(1_000_002)); + try testing.expect(!bm.contains(999_999)); + try testing.expectEqual(@as(u64, 1_000_002), bm.largest().?); +} + +test "PacketNumberBitmap: window shift on large jump" { + var bm = PacketNumberBitmap{}; + + // Mark some early packets + _ = bm.mark(0); + _ = bm.mark(1); + _ = bm.mark(2); + + // Jump far ahead — forces window shift + _ = bm.mark(10000); + try testing.expect(bm.contains(10000)); + // Early packets should be gone (shifted out) + try testing.expect(!bm.contains(0)); + try testing.expect(!bm.contains(1)); + try testing.expect(!bm.contains(2)); +} + +test "PacketNumberBitmap: getRanges limited output" { + var bm = PacketNumberBitmap{}; + + // Create many ranges + _ = bm.mark(0); + _ = bm.mark(2); + _ = bm.mark(4); + _ = bm.mark(6); + _ = bm.mark(8); + + // Only ask for 2 ranges (should get the highest 2) + var out: [2]Range = undefined; + const n = bm.getRanges(&out); + try testing.expectEqual(@as(usize, 2), n); + try testing.expectEqual(@as(u64, 8), out[0].start); + try testing.expectEqual(@as(u64, 6), out[1].start); +} From 9aafb73480e6562fb88fe9d6719d0f95e2226c9d Mon Sep 17 00:00:00 2001 From: Endel Dreyer Date: Thu, 19 Mar 2026 01:03:58 -0300 Subject: [PATCH 2/5] Add HyStart++ (RFC 9406) to CUBIC for better slow start exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detects RTT increases during slow start before actual packet loss, reducing bufferbloat on the first congestion event. - Standard → CSS transition on RTT increase above eta threshold - CSS grows by MSS/4 per ACK (vs MSS in standard slow start) - CSS exits to congestion avoidance after 5 consecutive rounds - CSS returns to standard if RTT stabilizes - Round tracking via largest_sent_pn boundary - Connection passes latest_rtt and largest_sent_pn to CC on ACK --- src/quic/congestion.zig | 278 +++++++++++++++++++++++++++++++++++++++- src/quic/connection.zig | 14 +- 2 files changed, 287 insertions(+), 5 deletions(-) diff --git a/src/quic/congestion.zig b/src/quic/congestion.zig index 6f836ab..71e5a53 100644 --- a/src/quic/congestion.zig +++ b/src/quic/congestion.zig @@ -162,7 +162,7 @@ pub const NewReno = struct { } }; -/// CUBIC congestion control (RFC 8312). +/// CUBIC congestion control (RFC 8312) with HyStart++ (RFC 9406). /// /// CUBIC uses a cubic function for window growth, providing better /// bandwidth utilization on high-BDP networks compared to NewReno. @@ -171,6 +171,9 @@ pub const NewReno = struct { /// - Provides a TCP-friendly region for fairness with Reno flows /// - Beta = 0.7 (multiplicative decrease factor) /// - C = 0.4 (cubic scaling constant) +/// +/// HyStart++ (RFC 9406) improves slow start exit by detecting RTT +/// increases before actual packet loss, reducing bufferbloat. pub const Cubic = struct { /// Current congestion window in bytes. congestion_window: u64, @@ -207,6 +210,37 @@ pub const Cubic = struct { /// W_est: estimated TCP-friendly window (for TCP-friendly region). w_est: u64 = 0, + // ── HyStart++ state (RFC 9406) ── + + /// Current slow start phase + ss_phase: SlowStartPhase = .standard, + + /// Packet number at which the current round ends. + /// A round ends when an ACK acknowledges a packet >= round_end_pn. 
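+ /// advanceRound() rolls this forward and moves current_round_min_rtt
+ /// into last_round_min_rtt at each boundary.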
+ round_end_pn: ?u64 = null, + + /// Minimum RTT observed in the current round (nanoseconds). + current_round_min_rtt: i64 = std.math.maxInt(i64), + + /// Baseline min RTT from the previous round (nanoseconds). + /// Used to detect RTT increases. + last_round_min_rtt: i64 = std.math.maxInt(i64), + + /// Number of RTT samples collected in the current round. + rtt_sample_count: u32 = 0, + + /// Number of CSS rounds completed. + css_rounds: u32 = 0, + + /// cwnd at the beginning of CSS (for ssthresh calculation). + css_baseline_cwnd: u64 = 0, + + const SlowStartPhase = enum { + standard, // Normal exponential slow start + css, // Conservative Slow Start + done, // Exited slow start (won't re-enter) + }; + // ── Constants ── // Beta = 0.7 (RFC 8312 §4.5 recommends 0.7 for QUIC) const BETA_NUM: u64 = 7; @@ -217,6 +251,13 @@ pub const Cubic = struct { const C_SCALED: u64 = 400; const C_DENOM: u64 = 1000; + // HyStart++ constants (RFC 9406 §4.2) + const MIN_RTT_THRESH: i64 = 4_000_000; // 4ms in ns + const MAX_RTT_THRESH: i64 = 16_000_000; // 16ms in ns + const N_RTT_SAMPLE: u32 = 8; // Min samples per round + const CSS_GROWTH_DIVISOR: u64 = 4; // cwnd += MSS/4 per ACK in CSS + const CSS_ROUNDS: u32 = 5; // CSS rounds before exiting to CA + pub fn inSlowStart(self: *const Cubic) bool { return self.congestion_window < self.ssthresh; } @@ -242,7 +283,14 @@ pub const Cubic = struct { } /// Called when a packet is acknowledged. + /// `latest_rtt_ns` is the RTT sample from this ACK (0 if not a new sample). + /// `largest_pn` is the largest packet number being sent (for round tracking). pub fn onPacketAcked(self: *Cubic, acked_bytes: u64, sent_time: i64) void { + self.onPacketAckedWithRtt(acked_bytes, sent_time, 0, null); + } + + /// Extended version with RTT sample for HyStart++. + pub fn onPacketAckedWithRtt(self: *Cubic, acked_bytes: u64, sent_time: i64, latest_rtt_ns: i64, largest_sent_pn: ?u64) void { // Don't grow window during recovery if (self.inCongestionRecovery(sent_time)) return; @@ -250,8 +298,7 @@ pub const Cubic = struct { if (self.app_limited) return; if (self.inSlowStart()) { - // Slow start: increase by acked_bytes (same as NewReno) - self.congestion_window += acked_bytes; + self.slowStartOnAck(acked_bytes, latest_rtt_ns, largest_sent_pn); return; } @@ -259,6 +306,94 @@ pub const Cubic = struct { self.cubicUpdate(acked_bytes); } + /// HyStart++ slow start logic (RFC 9406). + fn slowStartOnAck(self: *Cubic, acked_bytes: u64, latest_rtt_ns: i64, largest_sent_pn: ?u64) void { + // Initialize first round if needed + if (self.round_end_pn == null) { + if (largest_sent_pn) |pn| { + self.round_end_pn = pn + 1; + } + } + + // Sample RTT + if (latest_rtt_ns > 0) { + self.rtt_sample_count += 1; + if (latest_rtt_ns < self.current_round_min_rtt) { + self.current_round_min_rtt = latest_rtt_ns; + } + } + + switch (self.ss_phase) { + .standard => { + // Normal exponential growth + self.congestion_window += acked_bytes; + + // Check for round end + if (self.isRoundEnd(largest_sent_pn)) { + // Enough samples? Check RTT increase. 
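+ // RFC 9406 §4.2: enter CSS when currentRoundMinRTT >= lastRoundMinRTT + eta,
+ // with eta = clamp(lastRoundMinRTT / 8, MIN_RTT_THRESH, MAX_RTT_THRESH).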
+ if (self.rtt_sample_count >= N_RTT_SAMPLE and + self.last_round_min_rtt < std.math.maxInt(i64)) + { + const eta = clampRttThresh(self.last_round_min_rtt); + if (self.current_round_min_rtt >= self.last_round_min_rtt + eta) { + // RTT increased → enter CSS + self.ss_phase = .css; + self.css_rounds = 0; + self.css_baseline_cwnd = self.congestion_window; + } + } + self.advanceRound(largest_sent_pn); + } + }, + .css => { + // Conservative Slow Start: grow by MSS / CSS_GROWTH_DIVISOR per ACK + self.congestion_window += self.max_datagram_size / CSS_GROWTH_DIVISOR; + + // Check for round end + if (self.isRoundEnd(largest_sent_pn)) { + self.css_rounds += 1; + + if (self.rtt_sample_count >= N_RTT_SAMPLE and + self.last_round_min_rtt < std.math.maxInt(i64)) + { + const eta = clampRttThresh(self.last_round_min_rtt); + if (self.current_round_min_rtt < self.last_round_min_rtt + eta) { + // RTT stabilized → back to standard slow start + self.ss_phase = .standard; + } + } + + if (self.css_rounds >= CSS_ROUNDS) { + // Exit slow start → congestion avoidance + self.ssthresh = self.congestion_window; + self.ss_phase = .done; + } + + self.advanceRound(largest_sent_pn); + } + }, + .done => { + // Shouldn't reach here (inSlowStart would be false) + self.cubicUpdate(acked_bytes); + }, + } + } + + /// Check if the current round has ended. + fn isRoundEnd(self: *const Cubic, largest_sent_pn: ?u64) bool { + const end_pn = self.round_end_pn orelse return false; + const sent_pn = largest_sent_pn orelse return false; + return sent_pn >= end_pn; + } + + /// Advance to the next round: save RTT stats and set new round boundary. + fn advanceRound(self: *Cubic, largest_sent_pn: ?u64) void { + self.last_round_min_rtt = self.current_round_min_rtt; + self.current_round_min_rtt = std.math.maxInt(i64); + self.rtt_sample_count = 0; + self.round_end_pn = if (largest_sent_pn) |pn| pn + 1 else null; + } + /// CUBIC window update during congestion avoidance. fn cubicUpdate(self: *Cubic, acked_bytes: u64) void { // Initialize epoch on first ACK after congestion event @@ -351,6 +486,9 @@ pub const Cubic = struct { MIN_WINDOW_PACKETS * self.max_datagram_size, ); self.bytes_acked_in_round = 0; + + // HyStart++: loss during slow start means we're done + self.ss_phase = .done; } /// Persistent congestion: reset to minimum window (RFC 9002 §7.6.2). @@ -361,6 +499,14 @@ pub const Cubic = struct { self.epoch_start = null; self.w_max = 0; self.bytes_acked_in_round = 0; + + // HyStart++: reset to standard for the new slow start phase + self.ss_phase = .standard; + self.round_end_pn = null; + self.current_round_min_rtt = std.math.maxInt(i64); + self.last_round_min_rtt = std.math.maxInt(i64); + self.rtt_sample_count = 0; + self.css_rounds = 0; } /// PTO does not reduce window (RFC 9002 §7.5). @@ -386,6 +532,13 @@ pub const Cubic = struct { } }; +/// HyStart++ RTT threshold clamp (RFC 9406 §4.2): +/// eta = clamp(lastRoundMinRTT / 8, MIN_RTT_THRESH, MAX_RTT_THRESH) +fn clampRttThresh(last_round_min_rtt: i64) i64 { + const eighth = @divTrunc(last_round_min_rtt, 8); + return @max(Cubic.MIN_RTT_THRESH, @min(eighth, Cubic.MAX_RTT_THRESH)); +} + /// Integer cube root approximation (Newton's method). 
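+/// CUBIC needs K = cbrt(w_max * (1 - beta) / C) (RFC 8312 §4.1); this
+/// supplies the cube root without floating point.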
fn icbrt(x: u64) u64 { if (x == 0) return 0; @@ -692,6 +845,125 @@ test "Cubic: PTO does not reduce window" { try testing.expectEqual(window_before, cc.congestion_window); } +// ── HyStart++ tests ── + +test "HyStart++: enters CSS on RTT increase" { + var cc = Cubic.init(); + try testing.expect(cc.inSlowStart()); + try testing.expectEqual(Cubic.SlowStartPhase.standard, cc.ss_phase); + + const base_rtt: i64 = 20_000_000; // 20ms + + // Simulate: sent packets 0..9, then ACK them all. + // largest_sent_pn=9 stays fixed during the entire ACK batch. + // First ACK initializes round_end_pn to 10. + // No round ends during this batch (all ACKed PNs < 10). + var i: u32 = 0; + while (i < Cubic.N_RTT_SAMPLE) : (i += 1) { + cc.onPacketAckedWithRtt(1200, 100, base_rtt, 9); + } + // round_end_pn = 10, no round ended yet (9 < 10) + try testing.expect(cc.round_end_pn != null); + try testing.expectEqual(@as(u64, 10), cc.round_end_pn.?); + try testing.expectEqual(@as(u32, 8), cc.rtt_sample_count); + + // Now simulate: sent packets 10..19, ACK them. + // largest_sent_pn=19. First ACK with pn >= 10 triggers round end. + cc.onPacketAckedWithRtt(1200, 200, base_rtt, 19); + // Round ended: last_round_min_rtt = base_rtt, round_end_pn = 20 + try testing.expectEqual(base_rtt, cc.last_round_min_rtt); + try testing.expectEqual(@as(u64, 20), cc.round_end_pn.?); + + // Continue ACKing round 2 with increased RTT (+5ms > eta=4ms) + const increased_rtt: i64 = base_rtt + 5_000_000; + i = 0; + while (i < Cubic.N_RTT_SAMPLE - 1) : (i += 1) { + cc.onPacketAckedWithRtt(1200, 200, increased_rtt, 19); + } + // 8 total samples in round 2 (1 from above + 7 here), round hasn't ended yet + + // Send packets 20..29, ACK triggers next round end + cc.onPacketAckedWithRtt(1200, 300, increased_rtt, 29); + // Round ended: RTT increased above threshold → should enter CSS + try testing.expectEqual(Cubic.SlowStartPhase.css, cc.ss_phase); +} + +test "HyStart++: CSS exits to CA after CSS_ROUNDS" { + var cc = Cubic.init(); + + // Establish baseline RTT + const base_rtt: i64 = 20_000_000; + var pn: u64 = 0; + + // First round: baseline + var i: u32 = 0; + while (i < Cubic.N_RTT_SAMPLE) : (i += 1) { + cc.onPacketAckedWithRtt(1200, 100, base_rtt, pn); + pn += 1; + } + + // Force into CSS + cc.ss_phase = .css; + cc.css_rounds = 0; + cc.css_baseline_cwnd = cc.congestion_window; + + // Run CSS_ROUNDS rounds with increasing RTT + var round: u32 = 0; + while (round < Cubic.CSS_ROUNDS) : (round += 1) { + const rtt = base_rtt + @as(i64, @intCast(round + 1)) * 2_000_000; + i = 0; + while (i < Cubic.N_RTT_SAMPLE) : (i += 1) { + cc.onPacketAckedWithRtt(1200, 200, rtt, pn); + pn += 1; + } + } + + // Should have exited slow start + try testing.expectEqual(Cubic.SlowStartPhase.done, cc.ss_phase); + try testing.expect(!cc.inSlowStart()); // ssthresh was set +} + +test "HyStart++: loss during slow start sets phase to done" { + var cc = Cubic.init(); + try testing.expectEqual(Cubic.SlowStartPhase.standard, cc.ss_phase); + + cc.onCongestionEvent(100, 200); + try testing.expectEqual(Cubic.SlowStartPhase.done, cc.ss_phase); +} + +test "HyStart++: persistent congestion resets to standard" { + var cc = Cubic.init(); + cc.ss_phase = .css; + cc.css_rounds = 3; + + cc.onPersistentCongestion(1_000_000_000); + try testing.expectEqual(Cubic.SlowStartPhase.standard, cc.ss_phase); + try testing.expectEqual(@as(u32, 0), cc.css_rounds); + try testing.expect(cc.inSlowStart()); +} + +test "HyStart++: CSS grows slower than standard" { + var cc_std = Cubic.init(); + var cc_css = 
Cubic.init(); + cc_css.ss_phase = .css; + cc_css.css_baseline_cwnd = cc_css.congestion_window; + + const initial_std = cc_std.congestion_window; + const initial_css = cc_css.congestion_window; + + // Same ACK to both + cc_std.onPacketAckedWithRtt(1200, 100, 20_000_000, 10); + cc_css.onPacketAckedWithRtt(1200, 100, 20_000_000, 10); + + const growth_std = cc_std.congestion_window - initial_std; + const growth_css = cc_css.congestion_window - initial_css; + + // CSS should grow slower + try testing.expect(growth_css < growth_std); + // CSS growth = MSS / CSS_GROWTH_DIVISOR = 1200 / 4 = 300 + try testing.expectEqual(@as(u64, DEFAULT_MAX_DATAGRAM_SIZE / Cubic.CSS_GROWTH_DIVISOR), growth_css); +} + // ── icbrt tests ── test "icbrt: basic cube roots" { diff --git a/src/quic/connection.zig b/src/quic/connection.zig index 18d3a29..e3a0c66 100644 --- a/src/quic/connection.zig +++ b/src/quic/connection.zig @@ -1345,7 +1345,12 @@ pub const Connection = struct { std.log.info("PMTUD: probe ACK'd, MTU raised to {d}", .{new_mtu}); } - self.cc.onPacketAcked(pkt.size, pkt.time_sent); + self.cc.onPacketAckedWithRtt( + pkt.size, + pkt.time_sent, + self.pkt_handler.rtt_stats.latest_rtt, + if (self.pkt_handler.sent[@intFromEnum(enc_level)].largest_sent) |ls| ls else null, + ); // Track whether a packet sent with current keys has been ACKed if (enc_level == .application) { @@ -1452,7 +1457,12 @@ pub const Connection = struct { self.pacer.max_datagram_size = new_mtu; std.log.info("PMTUD: probe ACK'd, MTU raised to {d}", .{new_mtu}); } - self.cc.onPacketAcked(pkt.size, pkt.time_sent); + self.cc.onPacketAckedWithRtt( + pkt.size, + pkt.time_sent, + self.pkt_handler.rtt_stats.latest_rtt, + if (self.pkt_handler.sent[@intFromEnum(enc_level)].largest_sent) |ls| ls else null, + ); // Stop including HANDSHAKE_DONE once a packet containing it is ACKed if (pkt.has_handshake_done) { From 3f6299226fc66b17bb50cb5536d49be553b34218 Mon Sep 17 00:00:00 2001 From: Endel Dreyer Date: Thu, 19 Mar 2026 01:09:48 -0300 Subject: [PATCH 3/5] Add sub-millisecond pacing via inline micro-sleep in send loop When the pacer requests a delay < 1ms, sleep inline in the send loop instead of returning to the event loop (which has only ms-resolution timers). This gives ~microsecond pacing precision for high-throughput transfers, similar to the Linux kernel QUIC's hrtimer approach. Applied to both Server and Client event loops. --- src/event_loop.zig | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/event_loop.zig b/src/event_loop.zig index a56c12b..855971e 100644 --- a/src/event_loop.zig +++ b/src/event_loop.zig @@ -901,7 +901,21 @@ pub fn Server(comptime Handler: type) type { var send_count: usize = 0; while (send_count < max_burst_packets) : (send_count += 1) { const bytes_written = conn.send(&self.out_buf) catch break; - if (bytes_written == 0) break; + if (bytes_written == 0) { + // Sub-ms pacing: if the pacer wants a short delay (< 1ms), + // spin-wait here instead of returning to the event loop + // which has only ms-resolution timers. This gives us ~µs + // pacing precision similar to the kernel's hrtimer approach. 
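+                    // Delays of 1ms or more (and non-pacing stalls) fall
+                    // through to break; the event loop's ms timer takes over.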
+ if (conn.pacer.bandwidth_shifted > 0) { + const now_ns: i64 = @intCast(std.time.nanoTimestamp()); + const pacer_delay = conn.pacer.timeUntilSend(now_ns); + if (pacer_delay > 0 and pacer_delay < 1_000_000) { // < 1ms + std.Thread.sleep(@intCast(pacer_delay)); + continue; // Retry send after micro-sleep + } + } + break; + } const send_addr = conn.peerAddress(); batch.add( @@ -1710,7 +1724,18 @@ pub fn Client(comptime Handler: type) type { var send_count: usize = 0; while (send_count < max_burst_packets) : (send_count += 1) { const bytes_written = conn.send(&self.out_buf) catch break; - if (bytes_written == 0) break; + if (bytes_written == 0) { + // Sub-ms pacing spin-wait (see server tickAndSend for details) + if (conn.pacer.bandwidth_shifted > 0) { + const now_ns: i64 = @intCast(std.time.nanoTimestamp()); + const pacer_delay = conn.pacer.timeUntilSend(now_ns); + if (pacer_delay > 0 and pacer_delay < 1_000_000) { + std.Thread.sleep(@intCast(pacer_delay)); + continue; + } + } + break; + } self.batch.add( self.out_buf[0..bytes_written], @ptrCast(send_addr), From 0036a5c3ab099de22d12acfafececb26f89d2dc1 Mon Sep 17 00:00:00 2001 From: Endel Dreyer Date: Thu, 19 Mar 2026 01:12:41 -0300 Subject: [PATCH 4/5] Add inline fast path to FrameSorter for in-order stream delivery For the common case where stream data arrives in order (offset == read_pos and no buffered chunks), stores data in a fixed 1500-byte inline buffer instead of going through the AutoArrayHashMap. This eliminates hash map insert/lookup/orderedRemove operations on the hot path. Inspired by lxin/quic kernel's approach of minimizing per-frame allocation. The pop() path still returns heap-allocated data for caller ownership compatibility, but the push() path saves hash map overhead. --- src/quic/stream.zig | 53 +++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/src/quic/stream.zig b/src/quic/stream.zig index 4c0b472..d9f0d31 100644 --- a/src/quic/stream.zig +++ b/src/quic/stream.zig @@ -38,10 +38,16 @@ pub fn isLocal(stream_id: u64, is_server: bool) bool { /// Gap-based frame sorter for out-of-order reassembly of stream data. /// Tracks received byte ranges and returns contiguous data starting from read_pos. +/// +/// Optimization (inspired by lxin/quic kernel): inline fast path for the common +/// in-order case. When data arrives exactly at read_pos with no out-of-order chunks +/// buffered, it's stored in a fixed inline buffer (zero heap allocation). pub const FrameSorter = struct { + const INLINE_BUF_SIZE = 1500; + allocator: Allocator, - /// Buffered data chunks, keyed by offset. + /// Buffered data chunks, keyed by offset. Used for out-of-order delivery. chunks: std.AutoArrayHashMap(u64, []const u8), /// Next offset to be read by the application. @@ -50,6 +56,11 @@ pub const FrameSorter = struct { /// The final offset (set when FIN is received). fin_offset: ?u64 = null, + /// Inline buffer for zero-alloc in-order fast path. + /// When data arrives at read_pos and chunks map is empty, copy here instead. + inline_buf: [INLINE_BUF_SIZE]u8 = undefined, + inline_len: u16 = 0, + pub fn init(allocator: Allocator) FrameSorter { return .{ .allocator = allocator, @@ -58,7 +69,6 @@ pub const FrameSorter = struct { } pub fn deinit(self: *FrameSorter) void { - // Free any owned data for (self.chunks.values()) |data| { self.allocator.free(data); } @@ -68,6 +78,9 @@ pub const FrameSorter = struct { /// Return the highest byte offset buffered (or read_pos if no chunks). 
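+    /// The inline fast-path buffer counts as buffered bytes starting at
+    /// read_pos (see the inline_len check below).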
pub fn highestReceived(self: *const FrameSorter) u64 { var highest = self.read_pos; + if (self.inline_len > 0) { + highest = self.read_pos + self.inline_len; + } for (self.chunks.keys(), self.chunks.values()) |off, val| { const end = off + val.len; if (end > highest) highest = end; @@ -79,24 +92,20 @@ pub const FrameSorter = struct { pub fn push(self: *FrameSorter, offset: u64, data: []const u8, fin: bool) !void { if (fin) { const new_fin = offset + data.len; - // RFC 9000 §4.5: final size cannot change once known if (self.fin_offset) |existing| { if (existing != new_fin) return error.FinalSizeError; } self.fin_offset = new_fin; } - // RFC 9000 §4.5: data cannot exceed known final size if (self.fin_offset) |fs| { if (offset + data.len > fs) return error.FinalSizeError; } if (data.len == 0) return; - // Skip data that's already been read if (offset + data.len <= self.read_pos) return; - // Trim data that partially overlaps with already-read region var effective_offset = offset; var effective_data = data; if (offset < self.read_pos) { @@ -105,20 +114,26 @@ pub const FrameSorter = struct { effective_offset = self.read_pos; } - // Check if there's already a chunk at this offset. - // Don't overwrite a longer chunk with a shorter one (retransmission - // with different fragmentation boundaries). Also free old data to - // prevent memory leaks. + // Fast path: in-order delivery, no buffered chunks, fits inline. + // Avoids hash map insertion + heap allocation entirely. + if (effective_offset == self.read_pos and + self.inline_len == 0 and + self.chunks.count() == 0 and + effective_data.len <= INLINE_BUF_SIZE) + { + @memcpy(self.inline_buf[0..effective_data.len], effective_data); + self.inline_len = @intCast(effective_data.len); + return; + } + + // Slow path: out-of-order or inline buffer occupied if (self.chunks.get(effective_offset)) |existing| { if (existing.len >= effective_data.len) { - // Existing chunk covers at least as much data — skip. return; } - // New chunk is longer — free old, overwrite below. self.allocator.free(existing); } - // Copy data to owned buffer const owned = try self.allocator.dupe(u8, effective_data); errdefer self.allocator.free(owned); @@ -127,7 +142,17 @@ pub const FrameSorter = struct { /// Pop the next contiguous chunk of data from the read position. /// Returns null if there's no data available at the current read position. + /// Returned data is always heap-allocated — caller must free. pub fn pop(self: *FrameSorter) ?[]const u8 { + // Fast path: inline buffer → dupe to heap for caller ownership. + // The win is push() avoided hash map operations entirely. + if (self.inline_len > 0) { + const len = self.inline_len; + self.inline_len = 0; + self.read_pos += len; + return self.allocator.dupe(u8, self.inline_buf[0..len]) catch null; + } + if (self.chunks.get(self.read_pos)) |data| { _ = self.chunks.orderedRemove(self.read_pos); self.read_pos += data.len; @@ -139,7 +164,7 @@ pub const FrameSorter = struct { /// Check if all data has been received (FIN reached and all data consumed). pub fn isComplete(self: *const FrameSorter) bool { if (self.fin_offset) |fin| { - return self.read_pos >= fin; + return self.read_pos >= fin and self.inline_len == 0; } return false; } From 9ff3e255d68ef9b6833dbe2f08c7118e9b20d5e2 Mon Sep 17 00:00:00 2001 From: Endel Dreyer Date: Thu, 19 Mar 2026 01:16:44 -0300 Subject: [PATCH 5/5] FrameSorter: inline fast path + O(1) swapRemove for pop Two optimizations for the stream frame reassembly hot path: 1. 
Inline buffer: when data arrives in-order (offset == read_pos) and no out-of-order chunks are buffered, copy to a fixed 1500-byte inline buffer instead of going through AutoArrayHashMap operations. Avoids hash map insert on push for the common case. 2. swapRemove instead of orderedRemove in pop(): O(1) instead of O(n) for removing consumed chunks from the hash map. Order doesn't matter since lookup is by key (offset), not position. --- src/quic/stream.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/quic/stream.zig b/src/quic/stream.zig index d9f0d31..1469272 100644 --- a/src/quic/stream.zig +++ b/src/quic/stream.zig @@ -154,7 +154,7 @@ pub const FrameSorter = struct { } if (self.chunks.get(self.read_pos)) |data| { - _ = self.chunks.orderedRemove(self.read_pos); + _ = self.chunks.swapRemove(self.read_pos); self.read_pos += data.len; return data; }