diff --git a/CHANGELOG.md b/CHANGELOG.md index 51ff98a..678129d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,57 @@ All notable changes to this project will be documented in this file. Format based on [Keep a Changelog](https://keepachangelog.com/). +## [0.2.1] - unreleased + +### Added — wg-relay hardening + +- **WG-shape filter at XDP**: drops packets whose first byte + isn't a WireGuard message type (1/2/3/4) or whose length + doesn't match the type. Fires before the source-IP lookup, + so non-WG noise on the relay's port stops at the NIC and + doesn't pollute `drop_unknown_src`. New counter + `drop_not_wg_shaped`. +- **MAC1 verification for handshakes**: when both ends of a + link have `wg peer pubkey` stamped, every handshake init + / response from a registered peer is verified against the + partner's pubkey via Blake2s-keyed MAC1. Mismatch drops + with `drop_handshake_pubkey_mismatch`. Catches misconfigured + clients (wrong relay, NAT collisions, stale endpoint reuse). + Engages only when the partner pubkey is set, so existing + operators keep today's behaviour exactly. +- **Automatic peer roaming** (`mode: wireguard`): a peer's + endpoint auto-updates when their IP changes. Handshake from + an unknown source is matched against every partner's pubkey + via MAC1 to identify which peer it's from; a candidate + endpoint is registered. The committed endpoint stays put + until transport-data from the candidate confirms the roam, + at which point the new endpoint is committed, the BPF map + is refreshed (XDP fast path picks up the new endpoint), and + the roster is persisted. New per-peer counter + `endpoint_relearn`. Forged handshakes (attacker who knows + the pubkey but lacks the private key) tick + `drop_relearn_unconfirmed` and never commit. +- **Dynamic source-IP blocklist**: source IPs that produce + repeated failed-confirm strikes (forged handshakes that + never progressed to transport data) escalate onto a BPF + blocklist. 
Defaults: 2 strikes / 60 s → 60 s block; + 5 strikes / 1 h → 1 h block; 10 strikes / 24 h → 24 h block. + Blocked sources drop at the top of XDP. Closes the + relay-as-anonymizer attack against the partner. New verb + `wg blocklist list`. New counter `xdp_drop_blocklisted` (XDP) + and per-IP strike records. +- **Endpoint-hijack defense**: a candidate's handshake init must receive a partner-attributable response (the partner's type-2 `receiver_index` matches the init's `sender_index`) before the candidate slot is allowed to confirm. A forger who can pass MAC1 but cannot complete the static-key handshake can no longer push the candidate to confirm by sending a matching-shaped transport-data packet of their own. +- **Type-2 from unknown source dropped outright**: legitimate handshake responses come from the committed responder endpoint, so an unknown-source type-2 has no place in the protocol and was an unauthenticated amplifier surface. +- **Retry-init forward rate-limit**: while a candidate is unconfirmed, the no-op-forward branch caps retry forwards at one per second per source. Legitimate `wg.ko` retries every 5 s; a flood of forged retries is clamped, and its source then accrues strikes toward the blocklist. +- **Strike-table sweep**: stale strike entries (older than the widest policy window) are pruned during candidate expiry so spoofed-source one-shot strikes can't grow the table without bound. + +### Added — Crypto + +- **Standalone Blake2s** in `src/crypto/`. RFC 7693 reference + port; libsodium ships Blake2b only and WireGuard's MAC1 + uses Blake2s specifically. Verified against the published + "abc" / empty-input vectors. 
+ ## [0.2.0] - 2026-04-26 ### Added — WireGuard Relay Mode diff --git a/CMakeLists.txt b/CMakeLists.txt index 6019e77..dcd5b61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ endif() project(hyper-derp LANGUAGES C CXX DESCRIPTION "High-performance DERP relay server" - VERSION 0.2.0 + VERSION 0.2.1 ) set(CMAKE_CXX_STANDARD 23) diff --git a/bpf/wg_relay.bpf.c b/bpf/wg_relay.bpf.c index 72aad0a..2d847d3 100644 --- a/bpf/wg_relay.bpf.c +++ b/bpf/wg_relay.bpf.c @@ -27,11 +27,14 @@ #include #include -// Stats indices. -#define STAT_RX 0 -#define STAT_FWD 1 -#define STAT_PASS_NO_PEER 2 -#define STAT_PASS_NO_MAC 3 +// Stats indices. Keep in sync with WgXdpStats in +// include/hyper_derp/wg_relay.h. +#define STAT_RX 0 +#define STAT_FWD 1 +#define STAT_PASS_NO_PEER 2 +#define STAT_PASS_NO_MAC 3 +#define STAT_DROP_NOT_WG_SHAPED 4 +#define STAT_DROP_BLOCKLISTED 5 // -- Map types -------------------------------------------- @@ -91,7 +94,7 @@ struct { struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 4); + __uint(max_entries, 8); __type(key, __u32); __type(value, __u64); } wg_xdp_stats SEC(".maps"); @@ -140,6 +143,26 @@ struct { __type(value, __u32); // ipv4 in network byte order } wg_nic_ips SEC(".maps"); +// Blocklist for source IPs that produced repeated failed +// candidate confirmations (i.e. forged handshakes — they had +// the partner's pubkey but couldn't progress to transport +// data because they don't have the static private key). +// Userspace populates expiry_ns from CLOCK_MONOTONIC; the +// BPF program compares against bpf_ktime_get_ns(). An +// entry whose expiry has passed is treated as not present — +// userspace sweeps the map periodically but a stale-but- +// expired entry is harmless either way. 
+struct blocklist_entry { + __u64 expiry_ns; // monotonic ns; 0 = no longer active +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, __u32); // IPv4 src in network byte order + __type(value, struct blocklist_entry); +} wg_blocklist SEC(".maps"); + // -- Helpers ---------------------------------------------- static __always_inline void @@ -205,6 +228,73 @@ int wg_relay_xdp(struct xdp_md *ctx) inc_stat(STAT_RX); + // Dynamic blocklist — drop sources that produced + // repeated failed candidate confirmations. Stale + // entries (expiry in the past) fall through. + { + __u32 src_ip = ip->saddr; + struct blocklist_entry *bl = + bpf_map_lookup_elem(&wg_blocklist, &src_ip); + if (bl && bl->expiry_ns > bpf_ktime_get_ns()) { + inc_stat(STAT_DROP_BLOCKLISTED); + return XDP_DROP; + } + } + + // WG-shape filter — peek at the first byte of the UDP + // payload and verify it's a WireGuard message type + // (1 init, 2 response, 3 cookie, 4 transport). Anything + // else is either malformed or a non-WG client that ended + // up at the relay's port; dropping at XDP keeps it off + // the forward path entirely so the partner never has to + // process it. Length sanity covers the fixed-size types; + // transport-data has variable length capped by MTU. + __u8 *wg = (void *)(udp + 1); + if ((void *)(wg + 1) > data_end) { + inc_stat(STAT_DROP_NOT_WG_SHAPED); + return XDP_DROP; + } + __u16 udp_payload_len = + bpf_ntohs(udp->len) - sizeof(struct udphdr); + __u8 wg_type = wg[0]; + if (wg_type == 1) { + if (udp_payload_len != 148) { + inc_stat(STAT_DROP_NOT_WG_SHAPED); + return XDP_DROP; + } + } else if (wg_type == 2) { + if (udp_payload_len != 92) { + inc_stat(STAT_DROP_NOT_WG_SHAPED); + return XDP_DROP; + } + } else if (wg_type == 3) { + if (udp_payload_len != 64) { + inc_stat(STAT_DROP_NOT_WG_SHAPED); + return XDP_DROP; + } + } else if (wg_type == 4) { + // Transport data: header (16 B) + counter (8 B) + + // at least the AEAD tag (16 B). 
+ if (udp_payload_len < 32) { + inc_stat(STAT_DROP_NOT_WG_SHAPED); + return XDP_DROP; + } + } else { + inc_stat(STAT_DROP_NOT_WG_SHAPED); + return XDP_DROP; + } + + // Hand handshake init (1) and response (2) packets up + // to userspace: it owns the MAC1 verification + the + // candidate-then-confirm roaming flow, neither of which + // fits the XDP verifier comfortably and both of which + // are rare enough (one handshake per session per ~25 s) + // that the userspace round trip is free. Cookie reply + // (3) and transport data (4) keep the XDP fast path. + if (wg_type == 1 || wg_type == 2) { + return XDP_PASS; + } + // Look up the source endpoint in the peer map. Miss // means either an unregistered peer or one whose // source IP/port doesn't match the operator's pin — diff --git a/cmake/libderp.cmake b/cmake/libderp.cmake index 6b81ce2..c199013 100644 --- a/cmake/libderp.cmake +++ b/cmake/libderp.cmake @@ -45,6 +45,7 @@ add_library(libderp_obj OBJECT src/einheit_protocol.cc src/einheit_channel.cc src/wg_relay.cc + src/crypto/blake2s.cc ) target_include_directories(libderp_obj PUBLIC ${PROJECT_SOURCE_DIR}/include @@ -53,6 +54,11 @@ target_include_directories(libderp_obj PUBLIC ${BPF_INCLUDE_DIR} ${ZMQ_INCLUDE_DIR} ) +target_include_directories(libderp_obj PRIVATE + # Internal-only headers (not part of the public include/ + # tree) — e.g. src/crypto/blake2s.h. + ${PROJECT_SOURCE_DIR}/src +) target_link_libraries(libderp_obj PUBLIC ${URING_LIB} ${SODIUM_LIB} diff --git a/dist/release-notes/v0.2.1.md b/dist/release-notes/v0.2.1.md new file mode 100644 index 0000000..8831f90 --- /dev/null +++ b/dist/release-notes/v0.2.1.md @@ -0,0 +1,50 @@ +## What's new + +Hardening for `mode: wireguard` plus **automatic peer roaming**. + +### Automatic roaming + +When a WG peer's IP changes — laptop changes networks, a CGNAT rebind, the home ISP renewed the DHCP lease — their tunnel through the relay used to break until an operator manually updated the roster. 
Now the relay recognises the peer's next handshake from the new IP via MAC1 verification against the link partner's stamped pubkey, candidate-registers the new endpoint, mirrors the partner's response to the candidate so the handshake completes, and commits the new endpoint once transport-data confirms the roam. The operator does nothing; the tunnel comes back on its own. + +The "tentative-then-confirm" gate makes this safe: an attacker who knows the partner's pubkey can forge a handshake init, but they can't progress to transport-data without the static private key, so the candidate expires uncommitted and the original endpoint stays put. + +### Dynamic blocklist + +Repeated failed-confirm attempts from the same source IP (i.e. forged handshakes from someone who has the pubkey but not the private key) escalate onto a BPF blocklist: + +- 2 strikes / 60 s → 60 s block +- 5 strikes / 1 h → 1 h block +- 10 strikes / 24 h → 24 h block + +Blocked sources drop at the top of XDP — they can't even reach the forward path, so the relay stops being the anonymization layer for the attacker. New `wg blocklist list` shows what's currently blocked. + +### Other hardening + +- **WG-shape filter** at XDP — drops packets on the relay's UDP port whose first byte isn't a WireGuard message type (1/2/3/4) or whose length doesn't match the type. Stops non-WG noise (port scans, misdirected clients) from polluting counters. +- **MAC1 verification on handshakes from registered sources** when both ends have a stamped pubkey — catches misconfigured clients pointing at the wrong relay. + +### New counters in `wg show` + +- `drop_not_wg_shaped` +- `drop_handshake_pubkey_mismatch` +- `drop_handshake_no_pubkey_match` +- `drop_relearn_unconfirmed` +- `xdp_drop_blocklisted` + +A non-zero `drop_relearn_unconfirmed` is the canonical "someone is forging handshakes" signal. 
+ +## Install + +```bash +sudo apt update && sudo apt install hyper-derp +``` + +(If you haven't added the repo: see the [v0.2.0 install instructions](https://github.com/hyper-derp/Hyper-DERP/releases/tag/v0.2.0).) + +## Compatibility + +- All 0.2.0 behaviour is unchanged unless you stamp pubkeys via `wg peer pubkey` — the new MAC1 path engages only for links with both ends' pubkeys on file. +- No CLI verb removals. New verb: `wg blocklist list`. +- Roster format extended with new optional per-peer fields (`endpoint_relearn`); old rosters load unchanged. + +Full changelog: [CHANGELOG.md](https://github.com/hyper-derp/Hyper-DERP/blob/v0.2.x/CHANGELOG.md) · Design notes: [docs/design/wg_relay_pubkey_filter.md](https://github.com/hyper-derp/Hyper-DERP/blob/v0.2.x/docs/design/wg_relay_pubkey_filter.md), [docs/design/wg_relay_hardening.md](https://github.com/hyper-derp/Hyper-DERP/blob/v0.2.x/docs/design/wg_relay_hardening.md) diff --git a/include/hyper_derp/wg_relay.h b/include/hyper_derp/wg_relay.h index e5bedaf..2b1cd7e 100644 --- a/include/hyper_derp/wg_relay.h +++ b/include/hyper_derp/wg_relay.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "hyper_derp/server.h" @@ -64,6 +65,49 @@ struct WgRelayPeer { /// Populates the BPF peer entry's ifindex so /// XDP_REDIRECT can pick the right egress NIC. std::string nic; + /// Times this peer's `endpoint` was relearned via the + /// MAC1-driven roaming flow. Persisted to the roster. + uint64_t endpoint_relearn = 0; + /// Pending relearn-candidate, populated when an unknown + /// source presents a handshake with valid MAC1 against + /// this peer's link partner. Cleared on confirm (transport + /// data flowed from the candidate) or expiry (no transport + /// data within 30 s). The committed `endpoint` above stays + /// untouched until confirm. 
+ struct sockaddr_storage candidate_endpoint{}; + socklen_t candidate_endpoint_len = 0; + uint64_t candidate_set_ns = 0; + /// `sender_index` from the candidate's handshake init (the + /// initiator-side session id, bytes 4..8 of the type-1 + /// packet). The matching handshake response from the + /// partner echoes this in its `receiver_index` field. + /// We only set candidate_partner_responded when we see + /// a response whose receiver_index matches — that proves + /// the response was for THIS candidate's init, not for an + /// unrelated concurrent handshake from the legitimate + /// peer at the committed endpoint. + uint32_t candidate_init_sender_index = 0; + /// True once we've forwarded a partner response whose + /// receiver_index matches candidate_init_sender_index. + /// ConfirmCandidateLocked refuses to commit until this is + /// set, which closes the "forger sends type-1 + type-4" + /// hijack: bob silently drops a forged init (sender static + /// is garbage), so no matching response ever flows, so + /// the candidate never gains the right to confirm. + bool candidate_partner_responded = false; + /// Steady-clock ns of the most recent retry-init forward + /// from the same source. Used to rate-limit retry forwards + /// while the candidate is unconfirmed: a forger with the + /// public mac1 key can craft unlimited valid type-1 inits, + /// so the no-op-forward branch would otherwise let them + /// bounce arbitrary packets at the partner via the relay. + /// wg.ko's normal retry cadence is 5 s, so capping at one + /// forward per second is conservative for legit clients + /// while sharply limiting amplifier abuse. + uint64_t candidate_last_forward_ns = 0; + /// Steady-clock ns of the last completed relearn — gates + /// new candidate registrations against rapid flapping. 
+ uint64_t last_relearn_ns = 0; }; /// One operator-declared forwarding link between two @@ -83,6 +127,28 @@ struct WgRelayStats { std::atomic fwd_packets{0}; std::atomic drop_unknown_src{0}; std::atomic drop_no_link{0}; + /// First-byte / length sanity check — drops packets whose + /// shape doesn't match a WireGuard message type. Mirrors the + /// XDP STAT_DROP_NOT_WG_SHAPED counter for the userspace + /// fallback path. + std::atomic drop_not_wg_shaped{0}; + /// Handshake init/response from a registered source whose + /// MAC1 field doesn't verify against the link partner's + /// stamped pubkey. Engages only when the operator has + /// stamped pubkeys on both ends of a link. A non-zero count + /// usually means a misconfigured client, a NAT collision, + /// or someone pointed at the wrong relay. + std::atomic drop_handshake_pubkey_mismatch{0}; + /// Handshake init/response from an unknown source whose + /// MAC1 didn't verify against any registered partner's + /// pubkey — i.e. wasn't a roam attempt for any known peer. + std::atomic drop_handshake_no_pubkey_match{0}; + /// Candidate slot expired without transport data confirming + /// it. Strong signal of a forged handshake — the source + /// could produce a valid MAC1 but couldn't progress to + /// transport data because they don't have the static + /// private key. + std::atomic drop_relearn_unconfirmed{0}; }; /// One attached NIC. The same BPF program is attached to @@ -115,6 +181,10 @@ struct WgXdpCtx { /// into per-peer rx_bytes/fwd_bytes in `wg peer list` so /// the operator sees XDP-path traffic alongside userspace. int peer_bytes_map_fd = -1; + /// Source-IP blocklist (HASH key=u32 IPv4 NBO, value= + /// blocklist_entry). Userspace writes; BPF reads + drops + /// on every packet from a live blocklisted source. + int blocklist_map_fd = -1; /// Devmap (key = ifindex, value = ifindex) used for /// cross-NIC redirect. Populated at attach with one /// entry per attachment's ifindex. 
@@ -140,6 +210,18 @@ struct WgXdpStats { uint64_t fwd_xdp = 0; uint64_t pass_no_peer = 0; uint64_t pass_no_mac = 0; + uint64_t drop_not_wg_shaped = 0; + uint64_t drop_blocklisted = 0; +}; + +/// Strike record per source IP — incremented when a candidate +/// endpoint that source registered fails to confirm via +/// transport-data. Escalates the source onto the blocklist +/// after a threshold is crossed. +struct WgRelayStrike { + uint32_t count = 0; + uint64_t first_strike_ns = 0; + uint64_t total_strikes = 0; }; struct WgRelay { @@ -147,11 +229,19 @@ struct WgRelay { uint16_t port = 0; std::vector peers; std::vector links; - /// peers_mu guards peers + links + roster_path writes. - /// All operator-side mutations and the recv loop's - /// per-packet lookups serialize on it; the lookup is - /// O(N) but N is small (operator-supplied peer roster). + /// peers_mu guards peers + links + roster_path writes + /// AND the strike + blocklist tables below. mutable std::mutex peers_mu; + /// Failed-confirm strikes by source IP (host-byte-order + /// uint32_t). Cleared from a peer's record once a confirm + /// succeeds; escalated to wg_blocklist once the threshold + /// is crossed. + std::map strikes; + /// Blocked source IPs (host-byte-order uint32_t) → expiry + /// timestamp (steady_clock ns). Mirrors the BPF + /// wg_blocklist map for `wg blocklist list` / userspace + /// drop in case XDP isn't attached. + std::map blocklist; WgRelayStats stats; std::atomic running{false}; std::thread loop_thread; @@ -225,6 +315,10 @@ struct WgRelayStatsSnapshot { uint64_t fwd_packets; uint64_t drop_unknown_src; uint64_t drop_no_link; + uint64_t drop_not_wg_shaped; + uint64_t drop_handshake_pubkey_mismatch; + uint64_t drop_handshake_no_pubkey_match; + uint64_t drop_relearn_unconfirmed; size_t peer_count; size_t link_count; /// XDP-path counters, zero when xdp.attached is false. 
@@ -233,6 +327,14 @@ struct WgRelayStatsSnapshot { }; WgRelayStatsSnapshot WgRelayGetStats(const WgRelay* r); +struct WgBlocklistView { + std::string ip; // dotted-quad IPv4 + uint64_t seconds_left; // until expiry + uint64_t total_strikes; // cumulative for this IP +}; +std::vector WgRelayListBlocklist( + const WgRelay* r); + } // namespace hyper_derp #endif // INCLUDE_HYPER_DERP_WG_RELAY_H_ diff --git a/src/crypto/blake2s.cc b/src/crypto/blake2s.cc new file mode 100644 index 0000000..e5aacd1 --- /dev/null +++ b/src/crypto/blake2s.cc @@ -0,0 +1,135 @@ +/// @file blake2s.cc +/// @brief RFC 7693 Blake2s reference implementation, single-shot. +// Copyright (c) 2026 Hyper-DERP contributors + +#include "blake2s.h" + +#include + +namespace hyper_derp::crypto { +namespace { + +constexpr uint32_t kIV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL, +}; + +constexpr uint8_t kSigma[10][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +inline uint32_t Load32LE(const uint8_t* p) { + return uint32_t{p[0]} | (uint32_t{p[1]} << 8) | + (uint32_t{p[2]} << 16) | (uint32_t{p[3]} << 24); +} + +inline void Store32LE(uint8_t* p, uint32_t v) { + p[0] = static_cast(v); + p[1] = static_cast(v >> 8); + p[2] = static_cast(v >> 16); + p[3] = static_cast(v >> 24); +} + +inline uint32_t Rotr(uint32_t x, int n) { + return (x >> n) | (x << (32 - n)); +} + +inline void G(uint32_t v[16], 
int a, int b, int c, int d, + uint32_t x, uint32_t y) { + v[a] = v[a] + v[b] + x; + v[d] = Rotr(v[d] ^ v[a], 16); + v[c] = v[c] + v[d]; + v[b] = Rotr(v[b] ^ v[c], 12); + v[a] = v[a] + v[b] + y; + v[d] = Rotr(v[d] ^ v[a], 8); + v[c] = v[c] + v[d]; + v[b] = Rotr(v[b] ^ v[c], 7); +} + +void Compress(uint32_t h[8], const uint8_t block[64], uint64_t t, + bool last) { + uint32_t m[16]; + for (int i = 0; i < 16; ++i) { + m[i] = Load32LE(block + i * 4); + } + uint32_t v[16]; + for (int i = 0; i < 8; ++i) v[i] = h[i]; + for (int i = 0; i < 8; ++i) v[i + 8] = kIV[i]; + v[12] ^= static_cast(t); + v[13] ^= static_cast(t >> 32); + if (last) v[14] = ~v[14]; + for (int r = 0; r < 10; ++r) { + const uint8_t* s = kSigma[r]; + G(v, 0, 4, 8, 12, m[s[0]], m[s[1]]); + G(v, 1, 5, 9, 13, m[s[2]], m[s[3]]); + G(v, 2, 6, 10, 14, m[s[4]], m[s[5]]); + G(v, 3, 7, 11, 15, m[s[6]], m[s[7]]); + G(v, 0, 5, 10, 15, m[s[8]], m[s[9]]); + G(v, 1, 6, 11, 12, m[s[10]], m[s[11]]); + G(v, 2, 7, 8, 13, m[s[12]], m[s[13]]); + G(v, 3, 4, 9, 14, m[s[14]], m[s[15]]); + } + for (int i = 0; i < 8; ++i) h[i] ^= v[i] ^ v[i + 8]; +} + +} // namespace + +void Blake2s(uint8_t* out, size_t out_len, const uint8_t* key, + size_t key_len, const uint8_t* in, size_t in_len) { + uint32_t h[8]; + for (int i = 0; i < 8; ++i) h[i] = kIV[i]; + // Parameter block: digest_length || key_length || fanout || depth. + h[0] ^= 0x01010000UL ^ (static_cast(key_len) << 8) ^ + static_cast(out_len); + + uint8_t block[64]; + uint64_t t = 0; + size_t cursor = 0; + bool finalized = false; + + // If keyed, the first block is the key padded with zeros and + // counts as 64 bytes consumed. When there's no message the + // keyed block is also the only (and final) block. 
+ if (key_len > 0) { + std::memset(block, 0, sizeof(block)); + std::memcpy(block, key, key_len); + if (in_len == 0) { + Compress(h, block, 64, true); + finalized = true; + } else { + Compress(h, block, 64, false); + t = 64; + } + } + + if (!finalized) { + // Bulk: full 64-byte blocks except possibly the last. + while (in_len - cursor > 64) { + std::memcpy(block, in + cursor, 64); + cursor += 64; + t += 64; + Compress(h, block, t, false); + } + // Final block (possibly short, possibly zero remaining). + std::memset(block, 0, sizeof(block)); + size_t last = in_len - cursor; + std::memcpy(block, in + cursor, last); + t += last; + Compress(h, block, t, true); + } + + uint8_t buf[32]; + for (int i = 0; i < 8; ++i) Store32LE(buf + i * 4, h[i]); + std::memcpy(out, buf, out_len); +} + +} // namespace hyper_derp::crypto diff --git a/src/crypto/blake2s.h b/src/crypto/blake2s.h new file mode 100644 index 0000000..e007b18 --- /dev/null +++ b/src/crypto/blake2s.h @@ -0,0 +1,25 @@ +/// @file blake2s.h +/// @brief Blake2s hash used by WireGuard's MAC1 derivation. +/// +/// Self-contained because libsodium ships Blake2b, not the +/// 32-bit Blake2s variant WireGuard's protocol uses. This is a +/// straight port of the RFC 7693 reference, scoped to what +/// wg-relay needs (single-shot, ≤32 byte output, ≤32 byte +/// key, no streaming API). +// Copyright (c) 2026 Hyper-DERP contributors + +#pragma once + +#include +#include + +namespace hyper_derp::crypto { + +/// One-shot keyed Blake2s. +/// * out_len: 1..32 +/// * key_len: 0..32 (0 = unkeyed) +/// Reference: RFC 7693 §3.2. 
+void Blake2s(uint8_t* out, size_t out_len, const uint8_t* key, + size_t key_len, const uint8_t* in, size_t in_len); + +} // namespace hyper_derp::crypto diff --git a/src/einheit_channel.cc b/src/einheit_channel.cc index daf6bdd..564c734 100644 --- a/src/einheit_channel.cc +++ b/src/einheit_channel.cc @@ -1928,6 +1928,26 @@ void WgShowConfig(Server* s, const Request& req, SetBody(r, body); } +void WgBlocklistList(Server* s, const Request& /*req*/, + Response* r) { + if (!WgGate(s, r)) return; + auto entries = WgRelayListBlocklist(s->wg_relay); + if (entries.empty()) { + SetBody(r, "blocklist=empty\n"); + return; + } + std::string b; + for (size_t i = 0; i < entries.size(); ++i) { + b += std::format("entry.{}.ip={}\n", i, entries[i].ip); + b += std::format("entry.{}.seconds_left={}\n", i, + entries[i].seconds_left); + b += std::format("entry.{}.total_strikes={}\n", i, + entries[i].total_strikes); + } + b += std::format("count={}\n", entries.size()); + SetBody(r, b); +} + void WgShow(Server* s, const Request& /*req*/, Response* r) { if (!WgGate(s, r)) return; @@ -1941,6 +1961,20 @@ void WgShow(Server* s, const Request& /*req*/, b += std::format("drop_unknown_src={}\n", stats.drop_unknown_src); b += std::format("drop_no_link={}\n", stats.drop_no_link); + // Aggregate non-WG-shaped drops across userspace + XDP so + // the operator sees one number regardless of which path + // the bytes took. + uint64_t shape_total = stats.drop_not_wg_shaped; + if (stats.xdp_attached) { + shape_total += stats.xdp.drop_not_wg_shaped; + } + b += std::format("drop_not_wg_shaped={}\n", shape_total); + b += std::format("drop_handshake_pubkey_mismatch={}\n", + stats.drop_handshake_pubkey_mismatch); + b += std::format("drop_handshake_no_pubkey_match={}\n", + stats.drop_handshake_no_pubkey_match); + b += std::format("drop_relearn_unconfirmed={}\n", + stats.drop_relearn_unconfirmed); b += std::format("xdp_attached={}\n", stats.xdp_attached ? 
"true" : "false"); if (stats.xdp_attached) { @@ -1952,6 +1986,8 @@ void WgShow(Server* s, const Request& /*req*/, stats.xdp.pass_no_peer); b += std::format("xdp_pass_no_mac={}\n", stats.xdp.pass_no_mac); + b += std::format("xdp_drop_blocklisted={}\n", + stats.xdp.drop_blocklisted); } SetBody(r, b); } @@ -2179,6 +2215,11 @@ Registry MakeRegistry() { m["wg_show"] = {WgShow, Role::kAny, "wg show", "Aggregate counters + roster summary", false, {}}; + m["wg_blocklist_list"] = { + WgBlocklistList, Role::kAny, "wg blocklist list", + "Source IPs auto-blocked after repeated failed-confirm " + "strikes (forged-handshake protection)", + false, {}}; return m; } diff --git a/src/wg_relay.cc b/src/wg_relay.cc index a37f7bf..1dc9345 100644 --- a/src/wg_relay.cc +++ b/src/wg_relay.cc @@ -3,6 +3,8 @@ #include "hyper_derp/wg_relay.h" +#include "crypto/blake2s.h" + #include #include #include @@ -66,6 +68,46 @@ bool SockaddrEqual(const sockaddr_storage& a, socklen_t la, return std::memcmp(&a, &b, la) == 0; } +// Render an IPv4 sockaddr_storage as "host:port" for human- +// facing output (operator logs, roster file). Falls back to +// "?" if the family or length is something we can't render. +std::string FormatEndpoint(const sockaddr_storage& ss, + socklen_t len) { + if (ss.ss_family == AF_INET && + len >= static_cast(sizeof(sockaddr_in))) { + const auto* sin = + reinterpret_cast(&ss); + char buf[INET_ADDRSTRLEN]; + if (inet_ntop(AF_INET, &sin->sin_addr, buf, sizeof(buf))) { + return std::string(buf) + ":" + + std::to_string(ntohs(sin->sin_port)); + } + } + if (ss.ss_family == AF_INET6 && + len >= static_cast(sizeof(sockaddr_in6))) { + const auto* sin6 = + reinterpret_cast(&ss); + // V4-mapped → render as plain v4 for operator clarity. 
+ static const uint8_t kV4Prefix[12] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff}; + if (std::memcmp(sin6->sin6_addr.s6_addr, kV4Prefix, 12) == + 0) { + char buf[INET_ADDRSTRLEN]; + if (inet_ntop(AF_INET, sin6->sin6_addr.s6_addr + 12, buf, + sizeof(buf))) { + return std::string(buf) + ":" + + std::to_string(ntohs(sin6->sin6_port)); + } + } + char buf[INET6_ADDRSTRLEN]; + if (inet_ntop(AF_INET6, &sin6->sin6_addr, buf, sizeof(buf))) { + return std::string("[") + buf + "]:" + + std::to_string(ntohs(sin6->sin6_port)); + } + } + return "?"; +} + // Compare against an IPv4 endpoint when the incoming // packet arrived as v4-mapped over an AF_INET6 socket. bool EndpointMatches(const WgRelayPeer& p, @@ -355,16 +397,567 @@ void PersistRosterLocked(WgRelay* r) { // -- Forward path -------------------------------------- +// Roaming-candidate timing constants. Defaults match the +// numbers in docs/design/wg_relay_pubkey_filter.md. +// * Candidate slot lives 30 s waiting for transport-data +// confirmation. Expiry → drop_relearn_unconfirmed. +// * Cooldown of 5 s between successive candidate +// registrations for the same peer (rate-limits flap). +constexpr uint64_t kCandidateTimeoutNs = 30ULL * 1'000'000'000ULL; +constexpr uint64_t kRelearnCooldownNs = 5ULL * 1'000'000'000ULL; +// Rate-limit retry-init forwards from a not-yet-confirmed +// candidate. wg.ko's retry cadence is 5 s; legit clients only +// trip this when their network is genuinely flaky, while a +// forger spamming at line rate gets clamped to ~1 pps. +constexpr uint64_t kRetryForwardGapNs = 1ULL * 1'000'000'000ULL; + +// Blocklist escalation policy. Each strike records a failed +// candidate confirmation from this source IP; once the count +// crosses a threshold within the matching window, the source +// gets blocklisted for the listed duration. See +// docs/design/wg_relay_pubkey_filter.md for the rationale. 
struct StrikePolicy {
  uint32_t strikes;    // strike count that triggers this tier
  uint64_t window_ns;  // strikes must all fall inside this window
  uint64_t block_ns;   // how long the source stays blocklisted
};
constexpr StrikePolicy kStrikePolicy[] = {
    {2, 60ULL * 1'000'000'000ULL,
     60ULL * 1'000'000'000ULL},
    {5, 3600ULL * 1'000'000'000ULL,
     3600ULL * 1'000'000'000ULL},
    {10, 86400ULL * 1'000'000'000ULL,
     86400ULL * 1'000'000'000ULL},
};

// WireGuard MAC1 derivation. The MAC1 key for a peer is
// Blake2s(LABEL_MAC1 || peer_static_pubkey) — independent of
// the message, derived once per peer pubkey. The MAC1 itself
// is Blake2s_keyed(mac1_key, msg[0..len-32], outlen=16) and
// occupies bytes [len-32 .. len-16] of an init/response.
constexpr char kLabelMac1[] = "mac1----";

// Decode a 32-byte WireGuard pubkey from its base64 form
// (standard alphabet, '+' and '/').
//
// Accepts the 44-char padded form as well as the 43-char
// unpadded form; decoding stops at the first '='. Spaces,
// CR and LF are skipped. Returns false on malformed input:
// too short, a character outside the alphabet, or anything
// other than exactly 32 decoded bytes. On failure `*out`
// may have been partially written.
bool DecodeWgPubkey(const std::string& b64,
                    std::array<uint8_t, 32>* out) {
  // 32 bytes need at least 43 base64 chars (44 with padding).
  if (b64.size() < 43) return false;
  const auto sextet = [](char c) -> int {
    if (c >= 'A' && c <= 'Z') return c - 'A';
    if (c >= 'a' && c <= 'z') return c - 'a' + 26;
    if (c >= '0' && c <= '9') return c - '0' + 52;
    if (c == '+') return 62;
    if (c == '/') return 63;
    return -1;
  };
  // Unsigned accumulator, trimmed to the not-yet-emitted bits
  // after every byte: a signed int shifted left 43 times would
  // overflow (undefined behaviour before C++20).
  uint32_t held = 0;
  int bits = 0;
  size_t outc = 0;
  for (const char c : b64) {
    if (c == '=') break;
    if (c == '\n' || c == '\r' || c == ' ') continue;
    const int v = sextet(c);
    if (v < 0) return false;
    held = (held << 6) | static_cast<uint32_t>(v);
    bits += 6;
    if (bits >= 8) {
      bits -= 8;
      if (outc >= out->size()) return false;  // more than 32 bytes
      (*out)[outc++] = static_cast<uint8_t>((held >> bits) & 0xFFu);
      held &= (1u << bits) - 1u;  // keep only the pending low bits
    }
  }
  return outc == out->size();
}

// Compute the 32-byte MAC1 key from the peer's static pubkey.
// Cheap (one Blake2s of 40 bytes); we recompute per packet
// rather than caching — handshakes are rare.
+void DeriveMac1Key(const uint8_t pubkey[32], + uint8_t mac1_key[32]) { + uint8_t input[8 + 32]; + std::memcpy(input, kLabelMac1, 8); + std::memcpy(input + 8, pubkey, 32); + hyper_derp::crypto::Blake2s(mac1_key, 32, nullptr, 0, input, + sizeof(input)); +} + +// Verify the MAC1 field on a WG handshake init/response. +// Returns true if MAC1 matches the expected value derived +// from `partner_pubkey` (the responder's static key, which is +// what the *destination* of the forwarded packet is). +bool VerifyMac1(const uint8_t* pkt, size_t len, + const uint8_t partner_pubkey[32]) { + // MAC1 lives 32 bytes from the end (mac1 16 + mac2 16), + // computed over msg[0..len-32]. + if (len < 32) return false; + uint8_t mac1_key[32]; + DeriveMac1Key(partner_pubkey, mac1_key); + uint8_t got[16]; + hyper_derp::crypto::Blake2s(got, 16, mac1_key, 32, pkt, + len - 32); + // Constant-time compare. 16 bytes — small enough for a + // straight loop. + uint8_t diff = 0; + for (size_t i = 0; i < 16; ++i) { + diff |= got[i] ^ pkt[len - 32 + i]; + } + return diff == 0; +} + +// Return true if `pkt` looks like a valid WireGuard +// message: first byte is one of the four message types +// (1 init, 2 response, 3 cookie, 4 transport) and the +// length matches the type's expected size. Transport +// data has variable length; we only enforce a minimum. +// Mirrors the BPF-side check in bpf/wg_relay.bpf.c. +bool IsWgShaped(const uint8_t* pkt, size_t len) { + if (len < 1) return false; + switch (pkt[0]) { + case 1: return len == 148; // handshake init + case 2: return len == 92; // handshake response + case 3: return len == 64; // cookie reply + case 4: return len >= 32; // transport data + default: return false; + } +} + +// Pull the IPv4 source out of an arbitrary sockaddr_storage +// into host byte order. Returns 0 on non-v4 / unmappable. 
+uint32_t ExtractV4SrcHostOrder(const sockaddr_storage& ss, + socklen_t len) { + if (ss.ss_family == AF_INET && + len >= static_cast(sizeof(sockaddr_in))) { + const auto* sin = + reinterpret_cast(&ss); + return ntohl(sin->sin_addr.s_addr); + } + if (ss.ss_family == AF_INET6 && + len >= static_cast(sizeof(sockaddr_in6))) { + const auto* sin6 = + reinterpret_cast(&ss); + static const uint8_t kV4Prefix[12] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff}; + if (std::memcmp(sin6->sin6_addr.s6_addr, kV4Prefix, 12) + == 0) { + uint32_t a = 0; + std::memcpy(&a, sin6->sin6_addr.s6_addr + 12, 4); + return ntohl(a); + } + } + return 0; +} + +// Push the host-order IP + expiry into the BPF blocklist +// map. Best-effort — XDP not attached or bpf write failure +// just leaves userspace as the only enforcer. +void XdpBlocklistInsert(WgRelay* r, uint32_t host_ip, + uint64_t expiry_ns) { + if (!r->xdp.attached || r->xdp.blocklist_map_fd < 0) return; + uint32_t key = htonl(host_ip); + struct { + uint64_t expiry_ns; + } val{expiry_ns}; + bpf_map_update_elem(r->xdp.blocklist_map_fd, &key, &val, + BPF_ANY); +} + +void XdpBlocklistDelete(WgRelay* r, uint32_t host_ip) { + if (!r->xdp.attached || r->xdp.blocklist_map_fd < 0) return; + uint32_t key = htonl(host_ip); + bpf_map_delete_elem(r->xdp.blocklist_map_fd, &key); +} + +// Record a failed-confirm strike for `src` and escalate to +// the blocklist if the threshold is crossed. Caller holds +// peers_mu. +void RecordStrikeLocked(WgRelay* r, + const sockaddr_storage& src, + socklen_t src_len) { + uint32_t ip_h = ExtractV4SrcHostOrder(src, src_len); + if (ip_h == 0) return; // not v4-representable + uint64_t now = NowNs(); + WgRelayStrike& s = r->strikes[ip_h]; + s.total_strikes += 1; + + // Find the policy whose window starts past the first + // strike — i.e. the recent-most window that's still open. + // Reset the per-window count if we're outside the most + // generous window. 
+ const StrikePolicy* widest = &kStrikePolicy[std::size(kStrikePolicy) - 1]; + if (s.first_strike_ns == 0 || + now - s.first_strike_ns > widest->window_ns) { + s.count = 0; + s.first_strike_ns = now; + } + s.count += 1; + + // Walk policies from strictest to most generous; the + // first one whose window contains all strikes AND whose + // strike count is met escalates. + for (const auto& pol : kStrikePolicy) { + if (s.count >= pol.strikes && + now - s.first_strike_ns <= pol.window_ns) { + uint64_t expiry = now + pol.block_ns; + auto [it, inserted] = + r->blocklist.insert_or_assign(ip_h, expiry); + XdpBlocklistInsert(r, ip_h, expiry); + if (inserted) { + char buf[INET_ADDRSTRLEN]; + uint32_t nbo = htonl(ip_h); + inet_ntop(AF_INET, &nbo, buf, sizeof(buf)); + spdlog::warn( + "wg-relay blocklist {}: {} strikes in window " + "→ blocked for {}s", + buf, s.count, pol.block_ns / 1'000'000'000ULL); + } + // Reset per-window counter so the next round starts + // fresh after this block expires. + s.count = 0; + s.first_strike_ns = 0; + return; + } + } +} + +// Sweep peers and clear any candidate slot that's been open +// longer than kCandidateTimeoutNs. drop_relearn_unconfirmed +// ticks once per expiry — strong signal that a forged +// handshake came in but the source couldn't progress to +// transport data. Caller holds peers_mu. +void ExpireCandidatesLocked(WgRelay* r) { + uint64_t now = NowNs(); + for (auto& p : r->peers) { + if (p.candidate_endpoint_len == 0) continue; + if (now - p.candidate_set_ns > kCandidateTimeoutNs) { + // Record a strike on the source IP before dropping + // the candidate state; if this source has been + // wasting candidate slots repeatedly it'll escalate + // onto the blocklist and stop reaching the relay + // entirely. 
+ RecordStrikeLocked(r, p.candidate_endpoint, + p.candidate_endpoint_len); + p.candidate_endpoint_len = 0; + p.candidate_set_ns = 0; + p.candidate_last_forward_ns = 0; + p.candidate_init_sender_index = 0; + p.candidate_partner_responded = false; + r->stats.drop_relearn_unconfirmed.fetch_add( + 1, std::memory_order_relaxed); + } + } + // Sweep stale strike entries — a strike whose first strike + // is older than the widest policy window can no longer fire + // anything, so keeping it in the map is pure memory growth. + // Without this sweep, a forger spraying from spoofed source + // IPs (each striking once and never returning) would grow + // the strike map unbounded. + uint64_t widest_window = + kStrikePolicy[std::size(kStrikePolicy) - 1].window_ns; + for (auto it = r->strikes.begin(); + it != r->strikes.end();) { + if (it->second.first_strike_ns != 0 && + now - it->second.first_strike_ns > widest_window) { + it = r->strikes.erase(it); + } else { + ++it; + } + } + // Sweep expired blocklist entries — stale-but-expired + // entries are harmless (BPF compares against current + // monotonic time) but cleaning them keeps `wg blocklist + // list` honest and bounds the BPF map size. + for (auto it = r->blocklist.begin(); + it != r->blocklist.end();) { + if (it->second <= now) { + XdpBlocklistDelete(r, it->first); + it = r->blocklist.erase(it); + } else { + ++it; + } + } +} + +// Find the link partner of the peer named `name`, if any. +// Caller holds peers_mu. +WgRelayPeer* FindLinkPartnerLocked(WgRelay* r, + const std::string& name) { + for (const auto& l : r->links) { + if (l.a == name) { + for (auto& p : r->peers) { + if (p.name == l.b) return &p; + } + } else if (l.b == name) { + for (auto& p : r->peers) { + if (p.name == l.a) return &p; + } + } + } + return nullptr; +} + +// Handle a handshake init/response that arrived from a source +// with no registered peer match. 
Try MAC1 against every +// registered peer's pubkey: the peer whose pubkey matches is +// the destination, and the actual sender is that peer's link +// partner. Register a candidate endpoint on the sender (the +// committed endpoint stays unchanged until transport data +// confirms) and forward the handshake to the destination. +// +// Caller holds peers_mu. Always increments either +// drop_handshake_no_pubkey_match (no match) or fwd_packets +// (candidate registered + forwarded). +void HandleUnknownSrcHandshakeLocked( + WgRelay* r, const uint8_t* pkt, size_t len, + const sockaddr_storage& src, socklen_t src_len) { + // Type 2 (response) from an unknown source has no place in + // the protocol: legitimate responses come from the + // committed responder endpoint and hit the regular forward + // path. Accepting them here would give a forger a free + // unauthenticated amplifier — register once, then bounce + // any number of WG-shaped packets at the partner via the + // same-source no-op-forward branch. Drop outright. + if (pkt[0] != 1) { + r->stats.drop_unknown_src.fetch_add( + 1, std::memory_order_relaxed); + return; + } + uint64_t now = NowNs(); + for (auto& p : r->peers) { + if (p.pubkey_b64.empty()) continue; + std::array p_pub; + if (!DecodeWgPubkey(p.pubkey_b64, &p_pub)) continue; + if (!VerifyMac1(pkt, len, p_pub.data())) continue; + + // p is the destination. Sender = p's link partner. + WgRelayPeer* sender = FindLinkPartnerLocked(r, p.name); + if (!sender) continue; + + // Cooldown: refuse if sender's last relearn was very + // recent. Treats the request as drop_unknown_src so an + // attacker can't infer cooldown from a separate counter. 
+ if (sender->last_relearn_ns && + now - sender->last_relearn_ns < kRelearnCooldownNs) { + r->stats.drop_unknown_src.fetch_add( + 1, std::memory_order_relaxed); + return; + } + + // Already-pending candidate gates: a back-to-back + // handshake from the SAME source would otherwise refresh + // the candidate's set_ns and the slot would never expire, + // letting an attacker pin the slot indefinitely. Treat + // that as a no-op forward — the existing candidate is + // still valid. We DO refresh candidate_init_sender_index + // (each retry from wg.ko picks a fresh sender_index, and + // we want the partner-response match to track the latest + // init since that's the one bob's wg.ko responds to). + if (sender->candidate_endpoint_len > 0 && + SockaddrEqual(sender->candidate_endpoint, + sender->candidate_endpoint_len, src, + src_len)) { + if (pkt[0] == 1 && len >= 8) { + uint32_t idx; + std::memcpy(&idx, pkt + 4, 4); + sender->candidate_init_sender_index = idx; + } + // Rate-limit retry forwards. Legit wg.ko retries every + // 5 s; a forger spamming retries to abuse the relay as + // an amplifier gets clamped to ~1 pps. Drops above the + // gap count as drop_unknown_src so the forger can't + // distinguish "you're rate-limited" from "your packet + // was malformed." + if (now - sender->candidate_last_forward_ns + < kRetryForwardGapNs) { + r->stats.drop_unknown_src.fetch_add( + 1, std::memory_order_relaxed); + return; + } + sender->candidate_last_forward_ns = now; + ssize_t sent = sendto( + r->sock_fd, pkt, len, 0, + reinterpret_cast(&p.endpoint), + p.endpoint_len); + if (sent > 0) { + r->stats.fwd_packets.fetch_add( + 1, std::memory_order_relaxed); + } + return; + } + + // A different source is contesting an existing candidate + // — drop it. The current candidate keeps its expiry + // timer; if it doesn't confirm, the strike + blocklist + // path will catch the attacker. 
+ if (sender->candidate_endpoint_len > 0) { + r->stats.drop_unknown_src.fetch_add( + 1, std::memory_order_relaxed); + return; + } + + // Register the candidate. Committed endpoint stays put. + std::memcpy(&sender->candidate_endpoint, &src, src_len); + sender->candidate_endpoint_len = src_len; + sender->candidate_set_ns = now; + sender->candidate_last_forward_ns = now; + sender->candidate_partner_responded = false; + // Init sender_index lives at bytes 4..8 of the type-1 + // packet (little-endian per the WG spec). We use it to + // match against the partner's response receiver_index + // before letting transport-data confirm the candidate. + uint32_t idx; + std::memcpy(&idx, pkt + 4, 4); + sender->candidate_init_sender_index = idx; + + // Forward the handshake to the destination. + ssize_t sent = sendto( + r->sock_fd, pkt, len, 0, + reinterpret_cast(&p.endpoint), + p.endpoint_len); + if (sent < 0) { + spdlog::warn("wg-relay candidate sendto: {}", + std::strerror(errno)); + return; + } + r->stats.fwd_packets.fetch_add( + 1, std::memory_order_relaxed); + spdlog::info( + "wg-relay candidate registered: {} <- {} (relearn " + "for partner {})", + sender->name, FormatEndpoint(src, src_len), p.name); + return; + } + + // No partner pubkey verified. + r->stats.drop_handshake_no_pubkey_match.fetch_add( + 1, std::memory_order_relaxed); +} + +// Forward-declare the XDP map updater — implementation lives +// further down with the rest of the BPF housekeeping. +void XdpInsertLinkByNameLocked(WgRelay* r, + const std::string& a, + const std::string& b); +void XdpRemoveLinkByNameLocked(WgRelay* r, + const std::string& a, + const std::string& b); + +// Match `src` against any registered peer's candidate slot. +// On match, commit the candidate as the new endpoint, refresh +// the BPF map so XDP starts forwarding via the new endpoint, +// persist the roster, and return the now-registered peer. +// Caller continues the existing forward path with this peer +// as the source. 
Returns nullptr if no candidate matched. +// +// Caller holds peers_mu. +WgRelayPeer* ConfirmCandidateLocked( + WgRelay* r, const sockaddr_storage& src, + socklen_t src_len) { + uint64_t now = NowNs(); + for (auto& p : r->peers) { + if (p.candidate_endpoint_len == 0) continue; + if (!SockaddrEqual(p.candidate_endpoint, + p.candidate_endpoint_len, src, + src_len)) { + continue; + } + // Refuse to confirm until we've forwarded a partner + // response whose receiver_index matched the candidate's + // init sender_index. Without this, an attacker who + // knows the partner's pubkey could send a forged type-1 + // followed by any 32-byte UDP starting with 0x04 and + // hijack the slot — bob never responded, but transport- + // shape was enough. + if (!p.candidate_partner_responded) { + // Don't confirm; let the candidate expire normally. + // Keep iterating in case a different peer's candidate + // matches this src. + continue; + } + // Find p's link partner so we can rewrite both + // directions of the BPF link in one shot. + WgRelayPeer* partner = FindLinkPartnerLocked(r, p.name); + + // Drop the OLD BPF entries first — they were keyed on + // p's pre-relearn endpoint. After the in-memory swap + // we re-insert with the new key. + if (partner) { + XdpRemoveLinkByNameLocked(r, p.name, partner->name); + } + + // Confirm: commit the candidate as the live endpoint. + std::memcpy(&p.endpoint, &p.candidate_endpoint, + p.candidate_endpoint_len); + p.endpoint_len = p.candidate_endpoint_len; + p.endpoint_str = FormatEndpoint(p.candidate_endpoint, + p.candidate_endpoint_len); + p.candidate_endpoint_len = 0; + p.candidate_set_ns = 0; + p.candidate_last_forward_ns = 0; + p.candidate_init_sender_index = 0; + p.candidate_partner_responded = false; + p.last_relearn_ns = now; + p.endpoint_relearn += 1; + + // Re-insert with the new endpoint so XDP starts + // forwarding via the fast path again. 
+ if (partner) { + XdpInsertLinkByNameLocked(r, p.name, partner->name); + } + + PersistRosterLocked(r); + // Clear any prior failed-confirm strikes for this + // source IP — the source has now demonstrated it + // controls a real WG session and is therefore a + // legitimate roamer, not a forger. + uint32_t ip_h = ExtractV4SrcHostOrder(src, src_len); + if (ip_h != 0) r->strikes.erase(ip_h); + + spdlog::info("wg-relay relearn {}: endpoint -> {}", + p.name, p.endpoint_str); + return &p; + } + return nullptr; +} + void HandlePacket(WgRelay* r, const uint8_t* pkt, size_t len, const sockaddr_storage& src, socklen_t src_len) { r->stats.rx_packets.fetch_add(1, std::memory_order_relaxed); + // Userspace-side blocklist check. XDP drops at the top + // of the program, so this only kicks in on the cold + // boot before XDP attaches or when XDP's missing. + { + uint32_t ip_h = ExtractV4SrcHostOrder(src, src_len); + if (ip_h != 0) { + std::lock_guard lk(r->peers_mu); + auto it = r->blocklist.find(ip_h); + if (it != r->blocklist.end() && it->second > NowNs()) { + return; + } + } + } + // Shape filter — drop anything that isn't a WireGuard + // message (types 1..4) with the right length. Keeps + // operator counters honest about non-WG noise hitting + // the relay's port and stops us forwarding garbage to + // the partner. Mirrors the BPF STAT_DROP_NOT_WG_SHAPED + // path so userspace and XDP agree on what gets dropped. + if (!IsWgShaped(pkt, len)) { + r->stats.drop_not_wg_shaped.fetch_add( + 1, std::memory_order_relaxed); + return; + } // Lookup is O(N) over the peer table; N is operator- // small (dozens), and the lock window covers the // sendto so the table can't change underfoot. std::lock_guard lk(r->peers_mu); + // Sweep stale candidate slots before deciding what to do + // with this packet — keeps drop_relearn_unconfirmed + // accurate without a separate timer thread. 
+ ExpireCandidatesLocked(r); WgRelayPeer* src_peer = nullptr; for (auto& p : r->peers) { if (EndpointMatches(p, src, src_len)) { @@ -373,9 +966,31 @@ void HandlePacket(WgRelay* r, const uint8_t* pkt, } } if (!src_peer) { - r->stats.drop_unknown_src.fetch_add( - 1, std::memory_order_relaxed); - return; + // Source doesn't match any registered peer. There are + // three legitimate sub-cases handled here, plus drop: + // * Handshake init/response (1, 2): possibly a roam + // attempt — try MAC1 against every partner's pubkey. + // * Transport data (4): possibly the confirmation step + // for a previously-registered candidate. + // * Anything else (3 or non-WG already filtered): drop. + if (pkt[0] == 1 || pkt[0] == 2) { + HandleUnknownSrcHandshakeLocked(r, pkt, len, src, + src_len); + return; + } + if (pkt[0] == 4) { + src_peer = ConfirmCandidateLocked(r, src, src_len); + if (!src_peer) { + r->stats.drop_unknown_src.fetch_add( + 1, std::memory_order_relaxed); + return; + } + // src_peer is now the confirmed peer; fall through. + } else { + r->stats.drop_unknown_src.fetch_add( + 1, std::memory_order_relaxed); + return; + } } src_peer->rx_bytes += len; src_peer->last_seen_ns = NowNs(); @@ -407,6 +1022,28 @@ void HandlePacket(WgRelay* r, const uint8_t* pkt, return; } + // Optional MAC1 verification on handshake init/response. + // Engages only when the link partner has a stamped pubkey; + // the operator opts in by running `wg peer pubkey + // ` on both ends of a link. Catches packets from a + // registered source that aren't actually WG handshakes + // for the configured partner — e.g. NAT collisions, stale + // endpoint reuse, or someone pointed at the wrong relay. 
+ if ((pkt[0] == 1 || pkt[0] == 2) && !dst->pubkey_b64.empty()) { + std::array partner_pub; + if (DecodeWgPubkey(dst->pubkey_b64, &partner_pub)) { + if (!VerifyMac1(pkt, len, partner_pub.data())) { + r->stats.drop_handshake_pubkey_mismatch.fetch_add( + 1, std::memory_order_relaxed); + return; + } + } + // If pubkey_b64 is set but doesn't decode, fall through — + // we'd rather forward a packet than drop one because of a + // bad operator-stamped key. The decode failure shows up at + // pubkey-set time anyway. + } + ssize_t sent = sendto( r->sock_fd, pkt, len, 0, reinterpret_cast(&dst->endpoint), @@ -416,6 +1053,42 @@ void HandlePacket(WgRelay* r, const uint8_t* pkt, std::strerror(errno)); return; } + // Handshake response → mirror to dst's pending candidate + // endpoint (so a real roamer at the new IP completes their + // handshake) and, if the response's receiver_index matches + // the candidate's stored init sender_index, mark the + // candidate as eligible to confirm via transport-data. + // The receiver_index check is the load-bearing one: it + // proves this response is FOR the candidate's init rather + // than for some concurrent legitimate handshake from the + // peer at the committed endpoint. Bytes 8..12 of the + // type-2 packet hold the response's receiver_index. + // Editing dst here is fine — caller holds peers_mu. 
+ if (pkt[0] == 2 && dst->candidate_endpoint_len > 0) { + sendto(r->sock_fd, pkt, len, 0, + reinterpret_cast( + &dst->candidate_endpoint), + dst->candidate_endpoint_len); + if (len >= 12) { + uint32_t recv_idx; + std::memcpy(&recv_idx, pkt + 8, 4); + WgRelayPeer* mut_dst = nullptr; + for (auto& p : r->peers) { + if (p.name == dst->name) { + mut_dst = &p; + break; + } + } + if (mut_dst && + mut_dst->candidate_init_sender_index != 0 && + recv_idx == mut_dst->candidate_init_sender_index) { + mut_dst->candidate_partner_responded = true; + spdlog::info( + "wg-relay candidate {}: partner responded, " + "eligible to confirm", mut_dst->name); + } + } + } r->stats.fwd_packets.fetch_add( 1, std::memory_order_relaxed); src_peer->fwd_bytes += static_cast(sent); @@ -423,9 +1096,21 @@ void HandlePacket(WgRelay* r, const uint8_t* pkt, void RecvLoop(WgRelay* r) { std::vector buf(2048); + uint64_t last_sweep_ns = 0; + constexpr uint64_t kSweepIntervalNs = + 1ULL * 1'000'000'000ULL; // 1 s while (r->running.load(std::memory_order_acquire)) { pollfd pfd{r->sock_fd, POLLIN, 0}; int rc = poll(&pfd, 1, 200); + // Periodic sweep — runs whether or not a packet + // arrived, so candidate timeouts and expired blocklist + // entries fire on idle relays too. 
+ uint64_t now = NowNs(); + if (now - last_sweep_ns >= kSweepIntervalNs) { + std::lock_guard lk(r->peers_mu); + ExpireCandidatesLocked(r); + last_sweep_ns = now; + } if (rc <= 0) continue; sockaddr_storage src{}; socklen_t src_len = sizeof(src); @@ -604,9 +1289,10 @@ bool XdpAttach(WgRelay* r, const std::string& iface_list, auto* devmap = get_map("wg_devmap"); auto* nic_macs_map = get_map("wg_nic_macs"); auto* nic_ips_map = get_map("wg_nic_ips"); + auto* blocklist_map = get_map("wg_blocklist"); if (!prog || !peers_map || !macs_map || !stats_map || !port_map || !peer_bytes_map || !devmap || - !nic_macs_map || !nic_ips_map) { + !nic_macs_map || !nic_ips_map || !blocklist_map) { spdlog::error("wg-relay xdp: program/maps not found " "in {}", bpf_obj_path); bpf_object__close(obj); @@ -621,6 +1307,7 @@ bool XdpAttach(WgRelay* r, const std::string& iface_list, int devmap_fd = bpf_map__fd(devmap); int nic_macs_fd = bpf_map__fd(nic_macs_map); int nic_ips_fd = bpf_map__fd(nic_ips_map); + int blocklist_fd = bpf_map__fd(blocklist_map); uint32_t key0 = 0; if (bpf_map_update_elem(port_fd, &key0, &port, BPF_ANY) < @@ -708,6 +1395,7 @@ bool XdpAttach(WgRelay* r, const std::string& iface_list, r->xdp.devmap_fd = devmap_fd; r->xdp.nic_macs_map_fd = nic_macs_fd; r->xdp.nic_ips_map_fd = nic_ips_fd; + r->xdp.blocklist_map_fd = blocklist_fd; r->xdp.attached = true; for (size_t i = 0; i < r->xdp.attachments.size(); ++i) { spdlog::info( @@ -847,8 +1535,10 @@ void XdpReadStats(const WgRelay* r, WgXdpStats* out) { auto vals = std::make_unique( static_cast(ncpus)); uint64_t* sums[] = {&out->rx_xdp, &out->fwd_xdp, - &out->pass_no_peer, &out->pass_no_mac}; - for (uint32_t k = 0; k < 4; ++k) { + &out->pass_no_peer, &out->pass_no_mac, + &out->drop_not_wg_shaped, + &out->drop_blocklisted}; + for (uint32_t k = 0; k < 6; ++k) { if (bpf_map_lookup_elem(r->xdp.stats_map_fd, &k, vals.get()) < 0) { continue; @@ -1207,6 +1897,17 @@ WgRelayStatsSnapshot WgRelayGetStats(const WgRelay* r) { 
std::memory_order_relaxed); s.drop_no_link = r->stats.drop_no_link.load( std::memory_order_relaxed); + s.drop_not_wg_shaped = r->stats.drop_not_wg_shaped.load( + std::memory_order_relaxed); + s.drop_handshake_pubkey_mismatch = + r->stats.drop_handshake_pubkey_mismatch.load( + std::memory_order_relaxed); + s.drop_handshake_no_pubkey_match = + r->stats.drop_handshake_no_pubkey_match.load( + std::memory_order_relaxed); + s.drop_relearn_unconfirmed = + r->stats.drop_relearn_unconfirmed.load( + std::memory_order_relaxed); s.xdp_attached = r->xdp.attached; XdpReadStats(r, &s.xdp); std::lock_guard lk(r->peers_mu); @@ -1215,4 +1916,25 @@ WgRelayStatsSnapshot WgRelayGetStats(const WgRelay* r) { return s; } +std::vector WgRelayListBlocklist( + const WgRelay* r) { + std::vector out; + std::lock_guard lk(r->peers_mu); + uint64_t now = NowNs(); + for (const auto& [ip_h, expiry] : r->blocklist) { + if (expiry <= now) continue; + char buf[INET_ADDRSTRLEN]; + uint32_t nbo = htonl(ip_h); + inet_ntop(AF_INET, &nbo, buf, sizeof(buf)); + WgBlocklistView v; + v.ip = buf; + v.seconds_left = (expiry - now) / 1'000'000'000ULL; + auto it = r->strikes.find(ip_h); + v.total_strikes = + (it != r->strikes.end()) ? it->second.total_strikes : 0; + out.push_back(std::move(v)); + } + return out; +} + } // namespace hyper_derp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 71c57d8..392e350 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,6 +27,10 @@ list(FILTER UNIT_TEST_SOURCES EXCLUDE REGEX "test_open_conn\\.cc") list(FILTER UNIT_TEST_SOURCES EXCLUDE REGEX "test_einheit_channel\\.cc") +# test_hd_blake2s links its own copy of src/crypto/blake2s.cc +# instead of libderp; keep it out of the auto-glob. 
+list(FILTER UNIT_TEST_SOURCES + EXCLUDE REGEX "test_hd_blake2s\\.cc") foreach(test_source ${UNIT_TEST_SOURCES}) get_filename_component(test_name ${test_source} NAME_WE) @@ -116,6 +120,18 @@ target_link_libraries(test_wg_config PRIVATE gtest_discover_tests(test_wg_config DISCOVERY_TIMEOUT 10 PROPERTIES TIMEOUT 30) +# Standalone Blake2s used for wg-relay MAC1. Compiles its +# own object so we don't drag libderp's full link surface in. +add_executable(test_hd_blake2s + test_hd_blake2s.cc + ${PROJECT_SOURCE_DIR}/src/crypto/blake2s.cc) +target_include_directories(test_hd_blake2s PRIVATE + ${PROJECT_SOURCE_DIR}/src) +target_link_libraries(test_hd_blake2s PRIVATE + GTest::gtest_main) +gtest_discover_tests(test_hd_blake2s + DISCOVERY_TIMEOUT 10 PROPERTIES TIMEOUT 10) + # ZMQ control channel tests. add_executable(test_ctl_channel test_ctl_channel.cc) target_link_libraries(test_ctl_channel PRIVATE diff --git a/tests/test_hd_blake2s.cc b/tests/test_hd_blake2s.cc new file mode 100644 index 0000000..7c6d4e5 --- /dev/null +++ b/tests/test_hd_blake2s.cc @@ -0,0 +1,89 @@ +/// @file test_blake2s.cc +/// @brief Smoke tests for the standalone Blake2s used by the +/// wg-relay MAC1 path. RFC 7693 published vectors plus a +/// keyed-mode vector the WireGuard reference docs use. +// Copyright (c) 2026 Hyper-DERP contributors + +#include + +#include +#include +#include + +#include "crypto/blake2s.h" + +namespace { + +std::string Hex(const uint8_t* p, size_t n) { + static constexpr char kHex[] = "0123456789abcdef"; + std::string out; + out.reserve(2 * n); + for (size_t i = 0; i < n; ++i) { + out.push_back(kHex[p[i] >> 4]); + out.push_back(kHex[p[i] & 0xF]); + } + return out; +} + +} // namespace + +TEST(Blake2s, AbcVector) { + // RFC 7693 §A.1 — Blake2s("abc", 32-byte output, no key). 
+ const uint8_t in[] = {'a', 'b', 'c'}; + uint8_t out[32]; + hyper_derp::crypto::Blake2s(out, 32, nullptr, 0, in, 3); + EXPECT_EQ(Hex(out, 32), + "508c5e8c327c14e2e1a72ba34eeb452f" + "37458b209ed63a294d999b4c86675982"); +} + +TEST(Blake2s, EmptyVector) { + uint8_t out[32]; + hyper_derp::crypto::Blake2s(out, 32, nullptr, 0, nullptr, 0); + EXPECT_EQ(Hex(out, 32), + "69217a3079908094e11121d042354a7c" + "1f55b6482ca1a51e1b250dfd1ed0eef9"); +} + +TEST(Blake2s, OutputLengthInfluencesDigest) { + // RFC 7693 encodes the output length in the parameter + // block, so a 16-byte digest of "abc" is NOT the prefix + // of the 32-byte digest of "abc". This catches a class of + // implementation bugs where outlen isn't mixed in. + const uint8_t in[] = {'a', 'b', 'c'}; + uint8_t out16[16]; + uint8_t out32[32]; + hyper_derp::crypto::Blake2s(out16, 16, nullptr, 0, in, 3); + hyper_derp::crypto::Blake2s(out32, 32, nullptr, 0, in, 3); + EXPECT_NE(std::memcmp(out16, out32, 16), 0); +} + +TEST(Blake2s, BlockBoundary) { + // 64-byte input — exactly one Blake2s block boundary. + // Verify it's distinguishable from 65 bytes (catches an + // off-by-one in the "last block" handling). + uint8_t in_64[64]; + for (size_t i = 0; i < 64; ++i) in_64[i] = static_cast(i); + uint8_t in_65[65]; + std::memcpy(in_65, in_64, 64); + in_65[64] = 0x40; + uint8_t out_64[32], out_65[32]; + hyper_derp::crypto::Blake2s(out_64, 32, nullptr, 0, in_64, 64); + hyper_derp::crypto::Blake2s(out_65, 32, nullptr, 0, in_65, 65); + EXPECT_NE(std::memcmp(out_64, out_65, 32), 0); +} + +TEST(Blake2s, KeyedDeterministic) { + // Keyed mode is what wg-relay's MAC1 uses. Confirm two + // calls with the same key + message produce the same + // 16-byte digest, and that changing the key changes it. 
+ uint8_t key1[32] = {1, 2, 3}; + uint8_t key2[32] = {9}; + const uint8_t msg[] = {'h', 'e', 'l', 'l', 'o'}; + uint8_t out_a[16], out_b[16], out_c[16]; + hyper_derp::crypto::Blake2s(out_a, 16, key1, 32, msg, 5); + hyper_derp::crypto::Blake2s(out_b, 16, key1, 32, msg, 5); + hyper_derp::crypto::Blake2s(out_c, 16, key2, 32, msg, 5); + EXPECT_EQ(std::memcmp(out_a, out_b, 16), 0); + EXPECT_NE(std::memcmp(out_a, out_c, 16), 0); +}