From 74e5cab25590e67d9ddcc608841a363893555632 Mon Sep 17 00:00:00 2001 From: keirsalterego Date: Mon, 15 Jun 2026 11:51:07 +0530 Subject: [PATCH] fix(proxy): exactly-once on ambiguous EDR failure, rollback retry + terminal state Four production-grade fixes in the containment proxy, each confirmed by the prior audit. T63 / CNT-05 (P1): the nonce was released on ANY EdrError, including a transport drop AFTER the request reached the EDR, which could double-execute a containment on retry. EdrError now distinguishes a pre-send transport failure (connection refused, DNS: the action provably did NOT run, safe to release the nonce) from an ambiguous one (timeout after send, lost response: the action MAY have run). reqwest errors are classified at the dispatch boundary via is_connect(). On /execute an ambiguous failure now HOLDS the nonce and returns a distinct needs_reconciliation state (may_have_executed:true) plus an audit entry that makes clear the action may have executed; only a provable non-event releases the nonce for a clean retry. T64 / RB-01 (P1): a transient EDR 5xx / ambiguous timeout on /rollback used to return a generic 502 and lean entirely on the Python pager. /rollback now retries the idempotent contain/lift up to ROLLBACK_MAX_ATTEMPTS with a short linear backoff (4xx and pre-send are not retried), and on exhaustion emits a distinct terminal ROLLBACK_FAILED audit entry + response body the pager keys on. Audit-before-act ordering is preserved. T79 PRX-08 (P3): the proxy booted on a signing secret < 32 bytes with only a warning. It now fails closed (panic) in production, mirroring the no-secret panic, and keeps the warning for dev/CI. The check is extracted as a pure, unit-tested function. T79 PRX-07 (P3): the per-tenant path rebuilt a fresh CrowdstrikeClient per request, so the per-instance token cache never helped and every action paid a full OAuth round-trip. Tokens are now cached process-wide, keyed by credential identity (base_url + client_id + a hash of the secret, never the plaintext secret), so consecutive actions for the same tenant reuse a valid bearer while tenants stay isolated. Adds tests for the pre-send vs ambiguous classification, the nonce hold/release decision, the rollback retry + terminal ROLLBACK_FAILED path, the PRX-08 fail-closed rule, and the PRX-07 cross-request token reuse + isolation. --- src/edr.rs | 418 ++++++++++++++++++++++++++++-- src/main.rs | 718 ++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 998 insertions(+), 138 deletions(-) diff --git a/src/edr.rs b/src/edr.rs index aafd10d..65f4962 100644 --- a/src/edr.rs +++ b/src/edr.rs @@ -72,13 +72,14 @@ //! so a retry can run on fresh state. use std::env; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant}; +use dashmap::DashMap; use reqwest::Client; use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; use thiserror::Error; -use tokio::sync::Mutex; use tracing::info; use crate::actions::{ActionDirection, ActionType}; @@ -86,6 +87,29 @@ use crate::actions::{ActionDirection, ActionType}; /// Default HTTP timeout for EDR calls, in seconds. const DEFAULT_TIMEOUT_SECS: u64 = 30; +/// Process-wide CrowdStrike OAuth bearer cache, keyed by credential identity +/// (PRX-07). +/// +/// The per-tenant dispatch path builds a fresh `CrowdstrikeClient` for every +/// request (so tenant A always acts with tenant A's key), which meant the old +/// per-instance token cache never survived a single request: every action paid +/// a full OAuth round-trip. This shared cache lives for the life of the process +/// and is keyed by a stable, non-secret identity (`base_url` + `client_id` + a +/// hash of the secret), so consecutive actions for the same tenant reuse a valid +/// bearer instead of re-authenticating. +/// +/// Keying on a credential identity rather than per-client keeps tenants +/// isolated: tenant A's entry can never serve tenant B a token, and a rotated +/// secret produces a different key (the old entry simply expires out). The +/// cached value is a short-lived bearer + a refresh deadline; secrets are never +/// stored as map keys (only a hash) and the cache is never logged. +static CROWDSTRIKE_TOKEN_CACHE: OnceLock> = OnceLock::new(); + +/// Accessor for the lazily-initialised shared token cache. +fn token_cache() -> &'static DashMap { + CROWDSTRIKE_TOKEN_CACHE.get_or_init(DashMap::new) +} + /// Errors that can happen during an EDR dispatch. /// /// Callers treat the transport-shaped variants the same on the wire (502 @@ -105,8 +129,19 @@ pub enum EdrError { #[error("edr returned server error {status}: {body}")] ServerError { status: u16, body: String }, - /// Transport error (DNS, TCP, TLS, timeout). - #[error("edr transport error: {0}")] + /// Transport error that definitively happened BEFORE the request bytes + /// reached the EDR: connection refused, DNS failure, TLS handshake failure. + /// The side effect provably did NOT happen, so the caller can safely + /// release the nonce and allow a clean retry (CNT-05 / T63). + #[error("edr transport error (pre-send): {0}")] + TransportPreSend(String), + + /// Transport error that is AMBIGUOUS about whether the EDR acted: a timeout + /// after the request was sent, a dropped/lost response, a body-read error. + /// The action MAY have executed, so the caller must NOT release the nonce + /// (releasing would let a retry double-execute the containment). Surfaced + /// as a needs-reconciliation state instead (CNT-05 / T63). + #[error("edr transport error (ambiguous, may have executed): {0}")] Transport(String), /// EDR returned a successful HTTP status but the response body @@ -133,6 +168,56 @@ pub enum EdrError { }, } +impl EdrError { + /// True when this error proves the EDR side effect did NOT happen, so the + /// caller may safely release the nonce and allow a clean retry (CNT-05). + /// + /// Only `TransportPreSend` (connection refused, DNS, TLS handshake) and a + /// pre-send config failure (`Misconfigured`, `Unsupported`) qualify: in all + /// of those no request bytes reached the EDR. `Transport` (ambiguous), a + /// 4xx/5xx the EDR returned, and an unexpected response all leave open the + /// possibility the action executed, so they are NOT release-safe. + /// + /// `Misconfigured`/`Unsupported` are listed here for completeness, but the + /// handler short-circuits both before dispatch, so in practice only the two + /// transport variants drive the release decision at the dispatch boundary. + pub fn side_effect_definitely_did_not_happen(&self) -> bool { + matches!( + self, + EdrError::TransportPreSend(_) + | EdrError::Misconfigured(_) + | EdrError::Unsupported { .. } + ) + } +} + +/// Classify a reqwest error from a `.send()` call into a pre-send (safe to +/// release the nonce) or ambiguous (the action may have executed) transport +/// error (CNT-05 / T63). +/// +/// The distinction is which side of "the request reached the EDR" the failure +/// fell on: +/// +/// - `is_connect()` means the TCP/TLS connection was never established, so the +/// request bytes never left for the EDR. Provably no side effect: pre-send. +/// - A timeout (`is_timeout()`), a lost/dropped response, or a body-read error +/// can all fire AFTER the request was written to the socket, when the EDR may +/// already have applied the action. Ambiguous: we must not release the nonce. +/// +/// reqwest collapses DNS resolution failures into the connect phase, so they +/// land in `is_connect()` and are correctly treated as pre-send. We bias the +/// unknown/uncategorised case toward ambiguous (the conservative direction): a +/// misclassified ambiguous error costs a human reconciliation, while a +/// misclassified pre-send error costs a double-execute, which is the outcome +/// this whole change exists to prevent. +fn classify_transport_error(err: &reqwest::Error) -> EdrError { + if err.is_connect() { + EdrError::TransportPreSend(err.to_string()) + } else { + EdrError::Transport(err.to_string()) + } +} + /// Which EDR a per-tenant credential targets. /// /// Carried on `EdrCredentials` so the central proxy can serve tenants on @@ -373,16 +458,21 @@ pub fn check_supported( /// CrowdStrike Falcon Real Time Response client. /// -/// Holds a `reqwest::Client` and a mutex-guarded token cache. We use a -/// single Mutex around the cache (rather than per-field locks) because -/// the critical section is microseconds long and contention is bounded -/// by request rate (low for human-approved actions). +/// Holds a `reqwest::Client` and a `cache_key` into the process-wide +/// `CROWDSTRIKE_TOKEN_CACHE` (PRX-07). The token cache is shared across requests +/// rather than per-instance, because the per-tenant dispatch path builds a fresh +/// client per request: a per-instance cache would be discarded immediately and +/// every action would re-authenticate. Keying the shared cache by credential +/// identity reuses a valid bearer across a tenant's consecutive actions while +/// keeping tenants isolated. pub struct CrowdstrikeClient { http: Client, base_url: String, client_id: String, client_secret: String, - token: Mutex>, + /// Stable, non-secret key into the shared token cache. Derived from + /// `base_url` + `client_id` + a hash of the secret (see `token_cache_key`). + cache_key: String, } #[derive(Clone)] @@ -392,6 +482,20 @@ struct CachedToken { refresh_at: Instant, } +/// Build the shared-cache key for a CrowdStrike credential. +/// +/// Combines `base_url` and `client_id` (the credential's identity) with a +/// SHA-256 hash of the secret, so a rotated secret keys to a different entry +/// (the stale token is never reused) without ever putting the plaintext secret +/// in a map key. Two requests for the same tenant credential produce the same +/// key and therefore share a bearer. +fn token_cache_key(base_url: &str, client_id: &str, client_secret: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(client_secret.as_bytes()); + let secret_hash = hex::encode(hasher.finalize()); + format!("{base_url}\u{0}{client_id}\u{0}{secret_hash}") +} + /// Shape of the OAuth2 token response from /// `POST /oauth2/token`. We only keep the fields we use. #[derive(Deserialize)] @@ -432,26 +536,33 @@ impl CrowdstrikeClient { .build() .map_err(|e| EdrError::Misconfigured(format!("http client build failed: {e}")))?; + let cache_key = token_cache_key(&base_url, &client_id, &client_secret); Ok(Self { http, base_url, client_id, client_secret, - token: Mutex::new(None), + cache_key, }) } /// Get a bearer token, refreshing if necessary. /// - /// Concurrency: only one task can hold the lock at a time, so - /// concurrent first-callers serialize on the token fetch. After the - /// first fetch, all callers within the cache window get the cached value - /// without re-fetching. A per-request client (the per-tenant path) lives - /// for one action, so it fetches once and is dropped. + /// Reads from and writes to the process-wide `CROWDSTRIKE_TOKEN_CACHE` + /// (PRX-07), keyed by `self.cache_key` (credential identity). Consecutive + /// actions for the same tenant therefore reuse a valid bearer across + /// requests, even though each per-tenant request builds a fresh client. A + /// cache hit returns with zero network calls; a miss or expired entry + /// fetches once and writes the new token back for the next request. + /// + /// Concurrency: a brief race where two first-callers both miss the cache and + /// both fetch is harmless, both write a valid bearer and the last writer + /// wins. We do not hold a lock across the network fetch (DashMap entries are + /// not held across the await), so a slow CrowdStrike never serializes every + /// other tenant's token fetch behind it. async fn bearer_token(&self) -> Result { - let mut guard = self.token.lock().await; - - if let Some(cached) = guard.as_ref() { + // Fast path: a live cached bearer for this exact credential identity. + if let Some(cached) = token_cache().get(&self.cache_key) { if Instant::now() < cached.refresh_at { return Ok(cached.bearer.clone()); } @@ -459,6 +570,12 @@ impl CrowdstrikeClient { // Need a fresh token. let url = format!("{}/oauth2/token", self.base_url); + // A failed token fetch is ALWAYS release-safe regardless of pre/post + // send: the containment device-action call is never made without a + // bearer, so the action provably did not happen. We classify a connect + // failure as pre-send and map any other token-fetch transport error to + // pre-send too, because the only side effect that matters (the action) + // could not have run. The token endpoint itself is read-only. let resp = self .http .post(&url) @@ -468,7 +585,7 @@ impl CrowdstrikeClient { ]) .send() .await - .map_err(|e| EdrError::Transport(e.to_string()))?; + .map_err(|e| EdrError::TransportPreSend(e.to_string()))?; let status = resp.status(); if !status.is_success() { @@ -488,11 +605,13 @@ impl CrowdstrikeClient { // CrowdStrike, and to avoid the cliff where the token expires // mid-flight. let lifetime = Duration::from_secs((parsed.expires_in * 8) / 10); - let new = CachedToken { - bearer: parsed.access_token.clone(), - refresh_at: Instant::now() + lifetime, - }; - *guard = Some(new); + token_cache().insert( + self.cache_key.clone(), + CachedToken { + bearer: parsed.access_token.clone(), + refresh_at: Instant::now() + lifetime, + }, + ); Ok(parsed.access_token) } @@ -524,6 +643,12 @@ impl CrowdstrikeClient { ids: vec![host], }; + // This is THE side-effecting call. Classify a transport failure here as + // pre-send (connection refused / DNS: the action never left) vs + // ambiguous (timeout after send / lost response: the action may have + // run). The nonce-release decision in the handler keys off that + // distinction so an ambiguous failure never lets a retry double-isolate + // a host (CNT-05 / T63). let resp = self .http .post(&url) @@ -531,7 +656,7 @@ impl CrowdstrikeClient { .json(&body) .send() .await - .map_err(|e| EdrError::Transport(e.to_string()))?; + .map_err(|e| classify_transport_error(&e))?; let status = resp.status(); if status.is_success() { @@ -653,6 +778,9 @@ impl SentinelOneClient { filter: S1Filter { uuids: vec![host] }, }; + // THE side-effecting call. Same pre-send vs ambiguous classification as + // CrowdStrike so the handler's nonce-release decision is correct for S1 + // too (CNT-05 / T63). let resp = self .http .post(&url) @@ -660,7 +788,7 @@ impl SentinelOneClient { .json(&body) .send() .await - .map_err(|e| EdrError::Transport(e.to_string()))?; + .map_err(|e| classify_transport_error(&e))?; let status = resp.status(); if status.is_success() { @@ -981,6 +1109,10 @@ mod tests { // points CrowdStrike at an unreachable base_url, so a transport // error proves the per-tenant client ran INSTEAD of the noop // fallback. If the fallback had been used we would get Ok. + // + // The failure is a connection refused on the OAuth token fetch, i.e. a + // pre-send transport error: no device-action call was ever made, so the + // action provably did not happen and the nonce is release-safe (CNT-05). let fallback = EdrClient::Noop; let creds = cs_creds(); let err = dispatch( @@ -993,11 +1125,88 @@ mod tests { .await .expect_err("per-tenant CrowdStrike client should attempt a real call and fail transport"); assert!( - matches!(err, EdrError::Transport(_)), - "expected transport error from per-tenant client, got {err:?}" + matches!(err, EdrError::TransportPreSend(_)), + "expected a pre-send transport error from per-tenant client, got {err:?}" + ); + assert!( + err.side_effect_definitely_did_not_happen(), + "connection refused before the action call means the side effect did not happen" + ); + } + + #[tokio::test] + async fn classify_connect_failure_is_pre_send_and_release_safe() { + // A connection refused (nothing listening) is reqwest::Error::is_connect: + // the request bytes never left, so it must classify as pre-send and be + // release-safe (CNT-05 / T63). We provoke a genuine connect failure by + // posting to a closed loopback port with the async client. + let client = Client::builder() + .timeout(Duration::from_secs(2)) + .build() + .expect("client"); + let err = client + .post("http://127.0.0.1:1/oauth2/token") + .send() + .await + .expect_err("connection to a closed port must fail"); + assert!( + err.is_connect(), + "sanity: a closed-port failure should be a connect error, got {err:?}" + ); + let classified = classify_transport_error(&err); + assert!( + matches!(classified, EdrError::TransportPreSend(_)), + "a connect failure must classify as pre-send, got {classified:?}" + ); + assert!( + classified.side_effect_definitely_did_not_happen(), + "a pre-send transport error must be release-safe" + ); + } + + #[test] + fn ambiguous_transport_is_not_release_safe() { + // An ambiguous transport error (timeout after send / lost response) MUST + // NOT be release-safe: the action may have executed, so releasing the + // nonce would let a retry double-execute (CNT-05 / T63). + let err = EdrError::Transport("timed out reading response".to_string()); + assert!( + !err.side_effect_definitely_did_not_happen(), + "an ambiguous transport error must not be treated as release-safe" ); } + #[test] + fn server_and_client_errors_are_not_release_safe() { + // A 4xx/5xx the EDR returned means it WAS contacted; the action may have + // taken effect even on a 5xx. Neither is release-safe. + let server = EdrError::ServerError { + status: 503, + body: String::new(), + }; + let client_err = EdrError::ClientError { + status: 429, + body: String::new(), + }; + assert!(!server.side_effect_definitely_did_not_happen()); + assert!(!client_err.side_effect_definitely_did_not_happen()); + } + + #[test] + fn unsupported_and_misconfigured_are_release_safe_pre_dispatch() { + // Both are decided before any network call, so they are release-safe by + // construction (the handler short-circuits Unsupported separately, but + // the predicate must still report the truth). + let unsupported = EdrError::Unsupported { + action: ActionType::ProcessKill, + provider: "crowdstrike", + detail: "no mapping", + }; + let misconfigured = EdrError::Misconfigured("no base_url".to_string()); + assert!(unsupported.side_effect_definitely_did_not_happen()); + assert!(misconfigured.side_effect_definitely_did_not_happen()); + } + #[tokio::test] async fn unusable_credential_falls_back_to_global() { // A credential whose secret is blank is NOT usable, so dispatch must @@ -1140,4 +1349,157 @@ mod tests { "S1 must receive exactly the approved actions, in order, and nothing for PROCESS_KILL" ); } + + // ── CrowdStrike token cache (PRX-07) ─────────────────────────────────── + // + // The shared `CROWDSTRIKE_TOKEN_CACHE` must let consecutive actions for the + // same tenant credential reuse a bearer ACROSS requests, even though each + // per-tenant request builds a fresh CrowdstrikeClient. A different + // credential must NOT reuse another's token. + + use std::sync::atomic::{AtomicUsize, Ordering}; + + /// A CrowdStrike-shaped mock that counts how many times `/oauth2/token` is + /// hit, so a test can prove the second action reused the cached bearer + /// instead of re-authenticating. + async fn spawn_counting_mock_crowdstrike() -> (String, Arc) { + use axum::extract::State; + use axum::response::Json; + use axum::Router; + + let token_hits = Arc::new(AtomicUsize::new(0)); + + async fn token(State(hits): State>) -> Json { + hits.fetch_add(1, Ordering::SeqCst); + Json(serde_json::json!({ + "access_token": "mock-bearer", + "token_type": "bearer", + "expires_in": 1800, + })) + } + + async fn device_action() -> Json { + Json(serde_json::json!({ "resources": [], "errors": [] })) + } + + let app = Router::new() + .route("/oauth2/token", axum::routing::post(token)) + .route( + "/devices/entities/devices-actions/v2", + axum::routing::post(device_action), + ) + .with_state(token_hits.clone()); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0") + .await + .expect("bind mock cs"); + let addr: SocketAddr = listener.local_addr().expect("mock cs addr"); + tokio::spawn(async move { + axum::serve(listener, app).await.expect("mock cs serve"); + }); + (format!("http://{addr}"), token_hits) + } + + #[tokio::test] + async fn crowdstrike_token_is_cached_across_requests_for_same_credential() { + // Two actions for the SAME tenant credential, each through a fresh + // CrowdstrikeClient (built per dispatch). The first fetches a token, the + // second must reuse it from the shared cache: exactly one token fetch + // for two actions. This is the PRX-07 fix; before it, every action paid + // a full OAuth round-trip. + let (base_url, token_hits) = spawn_counting_mock_crowdstrike().await; + let fallback = EdrClient::Noop; + // Unique client_id so this test's cache entry never collides with + // another test's (the cache is process-global). + let client_id = format!("tenant-cache-{}", std::process::id()); + let creds = EdrCredentials { + provider: EdrProvider::Crowdstrike, + api_key: client_id, + api_secret: Some("secret-1".to_string()), + base_url: Some(base_url), + }; + + for dir in [ActionDirection::Apply, ActionDirection::Reverse] { + dispatch( + &fallback, + Some(&creds), + ActionType::HostIsolation, + dir, + "device-1", + ) + .await + .expect("dispatch should succeed against the mock"); + } + + assert_eq!( + token_hits.load(Ordering::SeqCst), + 1, + "the second action must reuse the cached bearer: exactly one OAuth fetch for two \ + actions (PRX-07)" + ); + } + + #[tokio::test] + async fn crowdstrike_token_cache_does_not_leak_across_distinct_credentials() { + // Two DIFFERENT credentials (distinct secret -> distinct cache key) must + // each fetch their own token: a tenant never serves another tenant a + // bearer from the shared cache. + let (base_url, token_hits) = spawn_counting_mock_crowdstrike().await; + let fallback = EdrClient::Noop; + let id = format!("tenant-isolate-{}", std::process::id()); + + let creds_a = EdrCredentials { + provider: EdrProvider::Crowdstrike, + api_key: id.clone(), + api_secret: Some("secret-A".to_string()), + base_url: Some(base_url.clone()), + }; + let creds_b = EdrCredentials { + provider: EdrProvider::Crowdstrike, + api_key: id, + api_secret: Some("secret-B".to_string()), + base_url: Some(base_url), + }; + + dispatch( + &fallback, + Some(&creds_a), + ActionType::HostIsolation, + ActionDirection::Apply, + "device-1", + ) + .await + .expect("creds A dispatch"); + dispatch( + &fallback, + Some(&creds_b), + ActionType::HostIsolation, + ActionDirection::Apply, + "device-1", + ) + .await + .expect("creds B dispatch"); + + assert_eq!( + token_hits.load(Ordering::SeqCst), + 2, + "distinct credentials must each fetch their own token; no cross-credential reuse" + ); + } + + #[test] + fn token_cache_key_is_stable_and_secret_sensitive() { + // Same identity -> same key (so the cache hits). Different secret -> + // different key (so a rotated secret never reuses a stale token). The + // plaintext secret must NOT appear in the key. + let k1 = token_cache_key("https://api.example", "id-1", "secret-xyz"); + let k2 = token_cache_key("https://api.example", "id-1", "secret-xyz"); + let k3 = token_cache_key("https://api.example", "id-1", "rotated-secret"); + assert_eq!(k1, k2, "same credential identity must produce a stable key"); + assert_ne!(k1, k3, "a rotated secret must produce a different key"); + assert!( + !k1.contains("secret-xyz"), + "the plaintext secret must never appear in the cache key" + ); + } } diff --git a/src/main.rs b/src/main.rs index 3d04529..b748ce2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -74,6 +74,26 @@ mod nonce; /// timestamps caused by client clock skew or deliberate manipulation. const REPLAY_WINDOW_SECONDS: i64 = 30; +/// Total number of `/rollback` dispatch attempts before we give up and emit the +/// terminal `ROLLBACK_FAILED` state (RB-01 / T64). One initial try plus +/// `ROLLBACK_MAX_ATTEMPTS - 1` retries. Small on purpose: contain/lift are +/// idempotent, so a couple of quick retries absorb a transient EDR 5xx or a +/// brief network blip without turning the request handler into a long-running +/// pager. A persistent failure still surfaces promptly to the human pager. +const ROLLBACK_MAX_ATTEMPTS: u32 = 3; + +/// Base backoff between `/rollback` retry attempts. The delay grows linearly +/// with the attempt number (attempt 1 waits one base, attempt 2 waits two), a +/// gentle backoff that stays well inside a normal request timeout. Kept short +/// because a human approved this rollback and is waiting on the result. +const ROLLBACK_RETRY_BACKOFF: std::time::Duration = std::time::Duration::from_millis(250); + +/// Minimum acceptable length (bytes) of the proxy signing secret (PRX-08). +/// 32 bytes = 256 bits, matching the HMAC-SHA256 output width. A shorter secret +/// weakens the one check that authorizes a real EDR action, so it fails closed +/// in production. +const MIN_SECRET_LEN: usize = 32; + /// Default per-tenant request budget per one-second window. One tenant's /// containment burst can consume at most this many slots per second; once it /// is exhausted, only THAT tenant gets 429s. Every other tenant's @@ -349,6 +369,40 @@ struct ExecuteResponse { simulated: bool, } +/// Distinct status string the `/rollback` path returns when every internal +/// retry is exhausted and the lift may not have taken effect. The Python pager +/// keys on this exact string (and the `502` status) to page a human for manual +/// reconciliation, distinct from a transient blip that the bounded retry +/// already recovered from (RB-01 / T64). +const ROLLBACK_FAILED_STATUS: &str = "rollback_failed"; + +/// Distinct status string the `/execute` and `/rollback` paths return when an +/// EDR transport failure is AMBIGUOUS about whether the action ran (timeout +/// after send, lost response). The nonce is deliberately NOT released, so a +/// blind retry cannot double-execute; the Python side keys on this to surface a +/// human-reconciliation task instead of an auto-retry (CNT-05 / T63). +const NEEDS_RECONCILIATION_STATUS: &str = "needs_reconciliation"; + +/// Body for the error responses (`/execute` ambiguous, `/rollback` terminal +/// failure). Distinct from `ExecuteResponse` so the Python caller can pattern +/// match on `status` without it ever being confused with a success body. The +/// `may_have_executed` flag is the load-bearing field: true means the EDR may +/// already have acted and the nonce was held back to block a double-execute. +#[derive(Debug, Serialize, Deserialize, Clone)] +struct ActionFailureResponse { + /// One of `ROLLBACK_FAILED_STATUS` or `NEEDS_RECONCILIATION_STATUS`. + status: String, + + /// True when the action MAY have executed (ambiguous transport failure or + /// a rollback whose final attempt was ambiguous). When true the nonce was + /// NOT released, so a naive retry will be deduped rather than re-run. + may_have_executed: bool, + + /// Honesty label echoed from the request (demo/mock fleet), same meaning + /// as on `ExecuteResponse`. + simulated: bool, +} + /// Query parameters for the audit export endpoint. #[derive(Debug, Deserialize)] struct ExportQuery { @@ -386,11 +440,7 @@ async fn health() -> Json { /// /// Thin wrapper over `run_action` with `ActionDirection::Apply`. See /// `run_action` for the full step-numbered lifecycle. -async fn execute( - state: State, - headers: HeaderMap, - body: Bytes, -) -> Result, StatusCode> { +async fn execute(state: State, headers: HeaderMap, body: Bytes) -> Response { run_action(state, headers, body, actions::ActionDirection::Apply).await } @@ -402,11 +452,7 @@ async fn execute( /// difference is `ActionDirection::Reverse`, which makes the EDR client call /// the inverse vendor action, and the audit `action_type` is prefixed /// `ROLLBACK_` so the trail names what was undone. -async fn rollback( - state: State, - headers: HeaderMap, - body: Bytes, -) -> Result, StatusCode> { +async fn rollback(state: State, headers: HeaderMap, body: Bytes) -> Response { run_action(state, headers, body, actions::ActionDirection::Reverse).await } @@ -434,7 +480,7 @@ async fn run_action( headers: HeaderMap, body: Bytes, direction: actions::ActionDirection, -) -> Result, StatusCode> { +) -> Response { // ─ Step 1: Verify HMAC on the RAW bytes we received. ────────────── // // Critical correctness point: we verify against `body` (the exact @@ -443,14 +489,16 @@ async fn run_action( // signatures, and historically did - that bug masked the real // verification because differing serialization made every signature // mismatch indistinguishable from an invalid signature. - let signature = headers + let Some(signature) = headers .get("X-Vyrox-Signature") .and_then(|v| v.to_str().ok()) - .ok_or(StatusCode::UNAUTHORIZED)?; + else { + return StatusCode::UNAUTHORIZED.into_response(); + }; if let Err(err) = hmac::verify_signature(state.hmac_secret.as_bytes(), &body, signature) { warn!(error = %err, "signature verification failed"); - return Err(StatusCode::UNAUTHORIZED); + return StatusCode::UNAUTHORIZED.into_response(); } // ─ Step 2: Parse JSON. ──────────────────────────────────────────── @@ -458,11 +506,12 @@ async fn run_action( // Only after the HMAC passes do we trust the body enough to parse // it. Parsing before verification would expose any serde panic / // pathological input to unauthenticated callers. - let payload: ExecuteRequest = - serde_json::from_slice(&body).map_err(|_| StatusCode::BAD_REQUEST)?; + let Ok(payload) = serde_json::from_slice::(&body) else { + return StatusCode::BAD_REQUEST.into_response(); + }; if payload.request_id.trim().is_empty() { - return Err(StatusCode::BAD_REQUEST); + return StatusCode::BAD_REQUEST.into_response(); } // ─ Step 2b: Per-tenant rate limit. ──────────────────────────────── @@ -477,11 +526,13 @@ async fn run_action( tenant_id = %payload.tenant_id, "per-tenant rate limit exceeded" ); - return Err(StatusCode::TOO_MANY_REQUESTS); + return StatusCode::TOO_MANY_REQUESTS.into_response(); } // ─ Step 3: Replay window. ───────────────────────────────────────── - check_replay_window(payload.approved_at)?; + if let Err(status) = check_replay_window(payload.approved_at) { + return status.into_response(); + } // ─ Step 4: Nonce dedup. ─────────────────────────────────────────── // @@ -490,30 +541,31 @@ async fn run_action( // result and the EDR is NOT called again. A nonce-store transport // error (Redis down) fails CLOSED with a 503: skipping dedup could // double-execute a containment, so we refuse rather than guess. - let claim = state - .nonces - .claim_or_replay(&payload.request_id) - .await - .map_err(|err| { + let claim = match state.nonces.claim_or_replay(&payload.request_id).await { + Ok(claim) => claim, + Err(err) => { warn!(error = %err, "nonce store unavailable; failing closed"); - StatusCode::SERVICE_UNAVAILABLE - })?; + return StatusCode::SERVICE_UNAVAILABLE.into_response(); + } + }; match claim { nonce::Outcome::FreshClaim => { /* fall through to execution */ } nonce::Outcome::AlreadyExecuted { cached_response_json, } => { info!(request_id = %payload.request_id, "replaying cached response"); - let cached: ExecuteResponse = serde_json::from_str(&cached_response_json) - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; - return Ok(Json(ExecuteResponse { + let Ok(cached) = serde_json::from_str::(&cached_response_json) else { + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + }; + return Json(ExecuteResponse { status: "replayed".to_string(), simulated: cached.simulated, - })); + }) + .into_response(); } nonce::Outcome::InFlight => { warn!(request_id = %payload.request_id, "duplicate while in-flight"); - return Err(StatusCode::CONFLICT); + return StatusCode::CONFLICT.into_response(); } } @@ -540,7 +592,7 @@ async fn run_action( // the underlying issue (disk full, perm error) is fixed. warn!(error = %err, "audit write failed; releasing nonce claim"); release_nonce(&state.nonces, &payload.request_id).await; - return Err(StatusCode::INTERNAL_SERVER_ERROR); + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); } // ─ Step 6: Execute. ─────────────────────────────────────────────── @@ -557,81 +609,44 @@ async fn run_action( // cannot faithfully perform (e.g. PROCESS_KILL on CrowdStrike) must fail // loudly with a 501 before any network call, never be silently substituted. // The check is pure (action-name mappers only, no client, no network). + // + // contain/lift are idempotent, so the Reverse (/rollback) direction wraps + // the dispatch in a small bounded retry (RB-01 / T64): a transient EDR 5xx + // or a brief blip is absorbed in-proxy rather than immediately paging a + // human. /execute keeps a single attempt; a failed isolate is retried at the + // approval layer, and silently re-isolating is not something we want this + // handler doing on its own. let success_status = match direction { actions::ActionDirection::Apply => "executed", actions::ActionDirection::Reverse => "rolled_back", }; - let supported = edr::check_supported( + + // Pure supportability pre-flight: reject an action the provider cannot + // faithfully perform BEFORE any network call (and before the retry loop), so + // an unsupported action is one 501 with zero EDR calls, never retried and + // never silently substituted. `dispatch` re-checks internally, this keeps + // the decision explicit at the handler and out of the retry path. + if let Err(err) = edr::check_supported( &state.edr, payload.edr_credentials.as_ref(), payload.action_type, direction, - ); - let outcome: Result = match supported { - Err(err) => Err(err), - Ok(()) => { - // Real dispatch. The per-tenant credentials on the request take - // precedence over the global env fallback (E7). The EDR client - // owns its own retries, timeouts, and error mapping. We echo the - // request's `simulated` honesty label back unchanged. - edr::dispatch( - &state.edr, - payload.edr_credentials.as_ref(), - payload.action_type, - direction, - &payload.host, - ) - .await - .map(|()| ExecuteResponse { - status: success_status.to_string(), - simulated: payload.simulated, - }) - } - }; - let response = match outcome { - Ok(response) => response, - // An action the provider cannot faithfully perform is 501 Not - // Implemented (never silently substituted with a different action), - // in dry-run and live alike; every other failure is 502 Bad - // Gateway. Either way we append a failure audit entry so the trail - // records that the intent did NOT happen, then release the nonce so - // a retry runs on fresh state. + ) { + return handle_dispatch_failure(&state, &payload, direction, &audit_action, err).await; + } + + let dispatch_result = dispatch_with_retry(&state, &payload, direction).await; + + match dispatch_result { + Ok(()) => { /* fall through to success caching + 200 */ } Err(err) => { - let status = match err { - edr::EdrError::Unsupported { .. } => StatusCode::NOT_IMPLEMENTED, - _ => StatusCode::BAD_GATEWAY, - }; - warn!( - request_id = %payload.request_id, - ?direction, - error = %err, - status = status.as_u16(), - "EDR action failed; auditing failure and releasing nonce claim" - ); - // The step-5 entry recorded the INTENT. Without this - // companion entry the trail would read as if the action - // happened. Best-effort: the failure status is returned - // regardless, and a write error is logged, not swallowed - // into a fake success. - let failure_entry = audit::build_entry( - payload.tenant_id.clone(), - format!("FAILED_{audit_action}"), - payload.host.clone(), - payload.approved_by.clone(), - payload.simulated, - ); - if let Err(audit_err) = - audit::append_audit(&state.audit_log_path, &state.audit_chain, failure_entry).await - { - warn!( - request_id = %payload.request_id, - error = %audit_err, - "failed to write failure audit entry" - ); - } - release_nonce(&state.nonces, &payload.request_id).await; - return Err(status); + return handle_dispatch_failure(&state, &payload, direction, &audit_action, err).await; } + } + + let response = ExecuteResponse { + status: success_status.to_string(), + simulated: payload.simulated, }; // ─ Step 7: Cache the response for future retries. ──────────────── @@ -643,8 +658,9 @@ async fn run_action( // idempotent EDR action, which the replay window and EDR idempotency both // bound, and which is strictly safer than reporting a failure for an action // that succeeded. - let cache_payload = - serde_json::to_string(&response).map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + let Ok(cache_payload) = serde_json::to_string(&response) else { + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + }; if let Err(err) = state .nonces .record_response(&payload.request_id, cache_payload) @@ -657,7 +673,241 @@ async fn run_action( ); } - Ok(Json(response)) + Json(response).into_response() +} + +/// Dispatch the action to the EDR, retrying transient failures for the +/// idempotent `/rollback` (Reverse) direction (RB-01 / T64). +/// +/// `/execute` (Apply) dispatches exactly once: re-isolating a host on a blip is +/// the approval layer's call, not this handler's. `/rollback` (Reverse) retries +/// up to `ROLLBACK_MAX_ATTEMPTS` times because contain/lift are idempotent and a +/// transient EDR 5xx or a momentary blip should not page a human when one more +/// quick attempt would succeed. +/// +/// What is retried: only genuinely transient, retry-safe failures, a server +/// 5xx, or an ambiguous transport error where the lift may not have landed. +/// What is NOT retried: a pre-send failure (returned immediately so the nonce +/// can be released and the request retried cleanly from scratch), a 4xx client +/// error (a bad request will not get better by repeating it), and `Unsupported` +/// / `Misconfigured` (decided before any call). The supportability check that +/// rejects unsupported actions is pure and runs inside `dispatch`, so an +/// unsupported action still fails on the first attempt with zero calls. +async fn dispatch_with_retry( + state: &AppState, + payload: &ExecuteRequest, + direction: actions::ActionDirection, +) -> Result<(), edr::EdrError> { + let max_attempts = match direction { + actions::ActionDirection::Apply => 1, + actions::ActionDirection::Reverse => ROLLBACK_MAX_ATTEMPTS, + }; + + let mut attempt = 0; + loop { + attempt += 1; + let result = edr::dispatch( + &state.edr, + payload.edr_credentials.as_ref(), + payload.action_type, + direction, + &payload.host, + ) + .await; + + let err = match result { + Ok(()) => return Ok(()), + Err(err) => err, + }; + + // Stop early when another attempt cannot help: we are out of attempts, + // the failure is not retry-safe (4xx, pre-send, unsupported, + // misconfigured), or this is the Apply direction (max_attempts == 1). + if attempt >= max_attempts || !is_retryable(&err) { + return Err(err); + } + + warn!( + request_id = %payload.request_id, + ?direction, + attempt, + max_attempts, + error = %err, + "rollback dispatch failed transiently; retrying after backoff" + ); + // Linear backoff: attempt 1 waits one base, attempt 2 waits two. Short + // by design, a human approved this rollback and is waiting on it. + tokio::time::sleep(ROLLBACK_RETRY_BACKOFF * attempt).await; + } +} + +/// Whether a dispatch error is worth another attempt for an idempotent action. +/// +/// Only a server 5xx and an ambiguous transport error qualify: both can be a +/// momentary condition that a quick retry of an idempotent contain/lift clears. +/// A 4xx will not improve on repeat, and a pre-send / unsupported / +/// misconfigured error is handled by releasing the nonce (pre-send) or failing +/// loudly, never by retrying. +fn is_retryable(err: &edr::EdrError) -> bool { + matches!( + err, + edr::EdrError::ServerError { .. } | edr::EdrError::Transport(_) + ) +} + +/// Build the failure response (and companion audit entries) after a dispatch +/// failed, honoring the exactly-once and rollback-terminal-state contracts. +/// +/// Three distinct outcomes, by error class and direction: +/// +/// 1. **Unsupported** (`/execute` or `/rollback`): 501 Not Implemented. The +/// action has no faithful provider mapping; never substituted. The nonce is +/// released, a `FAILED_` entry records the intent did not happen. +/// 2. **Pre-send transport failure** (connection refused, DNS): the action +/// provably did NOT run, so we release the nonce so a retry runs cleanly on +/// fresh state, write a `FAILED_` entry, and return 502. +/// 3. **Ambiguous transport failure** (timeout after send, lost response): the +/// action MAY have run. We do NOT release the nonce, a blind retry would risk +/// a double-execute. We write a distinct `NEEDS_RECONCILIATION_` audit entry +/// making clear the action may have executed, and return a body the Python +/// pager keys on (`needs_reconciliation`, `may_have_executed: true`). +/// 4. **Rollback that exhausted its retries** (`/rollback` only, server 5xx or +/// ambiguous after `ROLLBACK_MAX_ATTEMPTS`): a distinct terminal +/// `ROLLBACK_FAILED` audit entry + `rollback_failed` response body the pager +/// keys on. The nonce is released for a 5xx (the lift did not land, a fresh +/// retry is safe) and held for an ambiguous final attempt (it may have). +/// +/// Audit-before-act ordering is preserved: the intent entry was already written +/// in step 5, and every companion entry here is written BEFORE this returns. +async fn handle_dispatch_failure( + state: &AppState, + payload: &ExecuteRequest, + direction: actions::ActionDirection, + audit_action: &str, + err: edr::EdrError, +) -> Response { + let is_rollback = matches!(direction, actions::ActionDirection::Reverse); + + // An action with no faithful provider mapping is 501 and is never retried or + // reconciled: nothing ran, release the nonce and audit the non-event. + if matches!(err, edr::EdrError::Unsupported { .. }) { + warn!( + request_id = %payload.request_id, + ?direction, + error = %err, + "EDR action unsupported; auditing non-event and releasing nonce" + ); + write_failure_entry(state, payload, &format!("FAILED_{audit_action}")).await; + release_nonce(&state.nonces, &payload.request_id).await; + return StatusCode::NOT_IMPLEMENTED.into_response(); + } + + // Did the side effect provably NOT happen? Drives both the nonce-release + // decision (release only when safe) and the audit/response wording. + let safe_to_release = err.side_effect_definitely_did_not_happen(); + + // The terminal label distinguishes a rollback that exhausted its retries + // (the pager treats it as needing human attention) from a one-shot execute + // failure, and an ambiguous "may have executed" from a clean non-event. + let (audit_label, response): (String, Response) = if is_rollback { + // /rollback exhausted ROLLBACK_MAX_ATTEMPTS (or hit a non-retryable + // error). Emit the distinct terminal ROLLBACK_FAILED state the Python + // pager keys on, rather than a generic 502 it has to infer from. + warn!( + request_id = %payload.request_id, + error = %err, + safe_to_release, + "rollback failed after internal retries; emitting terminal ROLLBACK_FAILED" + ); + let body = ActionFailureResponse { + status: ROLLBACK_FAILED_STATUS.to_string(), + may_have_executed: !safe_to_release, + simulated: payload.simulated, + }; + ( + format!("ROLLBACK_FAILED_{audit_action}"), + (StatusCode::BAD_GATEWAY, Json(body)).into_response(), + ) + } else if safe_to_release { + // /execute, pre-send: the isolate provably did not run. A FAILED_ entry + // records the non-event; the nonce is released below so a clean retry + // can run on fresh state. Plain 502. + warn!( + request_id = %payload.request_id, + error = %err, + "EDR execute failed pre-send; auditing non-event and releasing nonce" + ); + ( + format!("FAILED_{audit_action}"), + StatusCode::BAD_GATEWAY.into_response(), + ) + } else { + // /execute, ambiguous: the isolate MAY have run. Hold the nonce so a + // blind retry cannot double-execute, and surface a distinct + // needs-reconciliation state with may_have_executed:true. + warn!( + request_id = %payload.request_id, + error = %err, + "EDR execute transport failure is ambiguous; holding nonce and flagging for \ + human reconciliation (the action MAY have executed)" + ); + let body = ActionFailureResponse { + status: NEEDS_RECONCILIATION_STATUS.to_string(), + may_have_executed: true, + simulated: payload.simulated, + }; + ( + format!("NEEDS_RECONCILIATION_{audit_action}"), + (StatusCode::BAD_GATEWAY, Json(body)).into_response(), + ) + }; + + // Audit the outcome BEFORE returning. The step-5 entry recorded the intent; + // without this companion the trail would read as if the action happened (or + // as if it cleanly failed when it may not have). + write_failure_entry(state, payload, &audit_label).await; + + // Release the nonce ONLY when the side effect provably did not happen. This + // is the exactly-once invariant: an ambiguous failure keeps the claim so the + // next retry is deduped (replayed/InFlight), never re-executed. nonce.rs + // warns to release only when sure the side effect did not happen. + if safe_to_release { + release_nonce(&state.nonces, &payload.request_id).await; + } else { + warn!( + request_id = %payload.request_id, + "NOT releasing nonce: the EDR action may have executed; a retry must be deduped, \ + not re-run, until a human reconciles" + ); + } + + response +} + +/// Append a companion failure/outcome audit entry (best-effort). +/// +/// The action-type label (`FAILED_...`, `ROLLBACK_FAILED_...`, +/// `NEEDS_RECONCILIATION_...`) names what happened so the trail never reads as a +/// clean success. A write error is logged, not swallowed into a fake success and +/// not propagated, the caller is already returning a failure status. +async fn write_failure_entry(state: &AppState, payload: &ExecuteRequest, audit_label: &str) { + let entry = audit::build_entry( + payload.tenant_id.clone(), + audit_label.to_string(), + payload.host.clone(), + payload.approved_by.clone(), + payload.simulated, + ); + if let Err(audit_err) = + audit::append_audit(&state.audit_log_path, &state.audit_chain, entry).await + { + warn!( + request_id = %payload.request_id, + error = %audit_err, + audit_label, + "failed to write failure audit entry" + ); + } } /// `GET /audit/export?tenant_id=` @@ -846,6 +1096,27 @@ fn resolve_proxy_secret(is_production: bool) -> String { ); } +/// Decide whether a signing secret is too weak to serve production traffic +/// (PRX-08). +/// +/// Returns `Some(message)` when the boot must fail closed: a secret shorter than +/// `MIN_SECRET_LEN` (256 bits) in production. Returns `None` otherwise, including +/// for a short secret in dev/CI (the caller warns instead). Extracted as a pure +/// function so the fail-closed rule is unit-testable without booting the async +/// runtime, mirroring `resolve_proxy_secret`. +fn secret_strength_error(secret: &str, is_production: bool) -> Option { + if is_production && secret.len() < MIN_SECRET_LEN { + return Some(format!( + "proxy signing secret is only {} bytes; refusing to start in production with a \ + signing secret shorter than {MIN_SECRET_LEN} bytes (256 bits), which weakens the \ + HMAC that authorizes every EDR action. Set a >= {MIN_SECRET_LEN}-byte \ + VYROX_PROXY_SECRET (PRX-08).", + secret.len() + )); + } + None +} + /// Decide whether this is a production boot. /// /// The proxy has no first-class environment field, so it reads the same @@ -989,8 +1260,21 @@ async fn run() { // disable auth. (Per-tenant HKDF key derivation is deferred to keep this // wave in lockstep with the Python side.) let hmac_secret = resolve_proxy_secret(is_production); - if hmac_secret.len() < 32 { - warn!("proxy signing secret is shorter than 32 bytes; consider rotating to a longer key"); + // PRX-08: fail closed on a weak signing secret in production (see + // `secret_strength_error`). The check is extracted so it is unit-testable + // without booting the runtime; here we turn its verdict into a panic (prod) + // or a warning (dev/CI). + match secret_strength_error(&hmac_secret, is_production) { + Some(message) => panic!("{message}"), + None if hmac_secret.len() < MIN_SECRET_LEN => { + warn!( + secret_len = hmac_secret.len(), + min_len = MIN_SECRET_LEN, + "proxy signing secret is shorter than the minimum; this fails closed in \ + production. Rotate to a longer key (PRX-08)." + ); + } + None => {} } let audit_log_path = env::var("AUDIT_LOG_PATH").unwrap_or_else(|_| "./audit.jsonl".to_string()); @@ -1935,11 +2219,30 @@ mod tests { #[tokio::test] async fn rollback_against_failing_edr_is_bad_gateway() { - // A mock EDR that always 500s on the action endpoint. The proxy must - // surface that as 502 BAD_GATEWAY (the Python side maps that to a - // paged ROLLBACK_FAILED), never a silent success. - let state: MockEdrState = Arc::new(StdMutex::new(HashMap::new())); + // A mock EDR that always 500s on the action endpoint. After the bounded + // internal retries (RB-01 / T64) the proxy must surface that as 502 + // BAD_GATEWAY with the distinct terminal ROLLBACK_FAILED body the Python + // pager keys on, never a silent success. + let (router, _dir) = always_500_crowdstrike_router("rb-fail").await; + let creds = json!({ + "provider": "crowdstrike", + "api_key": "id", + "api_secret": "secret", + "base_url": router.1, + }); + let body = action_body("e2e-rb-fail", Some(creds)); + let (status, value) = post_json(&router.0, "/rollback", body).await; + assert_eq!(status, StatusCode::BAD_GATEWAY); + assert_eq!( + value["status"], ROLLBACK_FAILED_STATUS, + "a rollback that exhausts its retries must report the terminal ROLLBACK_FAILED state" + ); + } + /// Build a router plus a CrowdStrike-shaped mock whose token endpoint works + /// but whose device-action endpoint ALWAYS 500s, for the RB-01 terminal-state + /// tests. Returns (router, base_url) and a tempdir kept alive by the caller. + async fn always_500_crowdstrike_router(_tag: &str) -> ((Router, String), TempDir) { async fn token() -> Json { Json(json!({"access_token": "t", "expires_in": 1800})) } @@ -1952,7 +2255,127 @@ mod tests { "/devices/entities/devices-actions/v2", axum::routing::post(always_500), ) - .with_state(state); + .with_state(()); + let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind"); + let addr: SocketAddr = listener.local_addr().expect("addr"); + tokio::spawn(async move { + axum::serve(listener, app).await.expect("serve"); + }); + let (router, dir) = test_router(edr::EdrClient::Noop); + ((router, format!("http://{addr}")), dir) + } + + // ── T63 / CNT-05: pre-send vs ambiguous transport failure ────────────── + + #[tokio::test] + async fn execute_presend_failure_releases_nonce_for_clean_retry() { + // A connection refused on /execute is a PRE-SEND failure: the isolate + // provably did not run, so the nonce is released and a retry with the + // SAME request_id re-dispatches (502 again) rather than being deduped to + // 409. If the nonce had been wrongly held, the retry would be 409. + let (router, dir) = test_router(edr::EdrClient::Noop); + let creds = json!({ + "provider": "crowdstrike", + "api_key": "id", + "api_secret": "secret", + "base_url": "http://127.0.0.1:1", // nothing listening -> connect refused + }); + let body = action_body("presend-1", Some(creds.clone())); + let (s1, _) = post_json(&router, "/execute", body).await; + assert_eq!(s1, StatusCode::BAD_GATEWAY, "pre-send failure is 502"); + + // Same request_id again: a released nonce means a FRESH claim and another + // real attempt (502), NOT a 409 in-flight. + let body2 = action_body("presend-1", Some(creds)); + let (s2, _) = post_json(&router, "/execute", body2).await; + assert_eq!( + s2, + StatusCode::BAD_GATEWAY, + "a pre-send failure must release the nonce so the retry re-dispatches, not 409" + ); + + // The trail records the non-event as FAILED_, never NEEDS_RECONCILIATION_. + let log = std::fs::read_to_string(dir.path().join("audit.jsonl")).expect("audit log"); + assert!(log.contains("\"FAILED_HostIsolation\"")); + assert!(!log.contains("NEEDS_RECONCILIATION")); + } + + #[tokio::test] + async fn execute_ambiguous_failure_holds_nonce_and_flags_reconciliation() { + // A 5xx from the EDR on /execute is AMBIGUOUS: the EDR was contacted and + // may have acted. The proxy must NOT release the nonce (a blind retry + // would risk a double-isolate) and must surface the distinct + // needs_reconciliation state with may_have_executed:true. A retry with + // the same request_id is then deduped to 409 (the claim is still + // in-flight), proving the nonce was held. + let ((router, base_url), dir) = always_500_crowdstrike_router("amb").await; + let creds = json!({ + "provider": "crowdstrike", + "api_key": "id", + "api_secret": "secret", + "base_url": base_url, + }); + let body = action_body("ambiguous-1", Some(creds.clone())); + let (s1, v1) = post_json(&router, "/execute", body).await; + assert_eq!(s1, StatusCode::BAD_GATEWAY); + assert_eq!( + v1["status"], NEEDS_RECONCILIATION_STATUS, + "an ambiguous execute failure must flag for human reconciliation" + ); + assert_eq!( + v1["may_have_executed"], true, + "the response must warn the action may have executed" + ); + + // Same request_id again: the nonce was held (in-flight), so this dedups + // to 409 rather than re-executing. + let body2 = action_body("ambiguous-1", Some(creds)); + let s2 = post(&router, "/execute", body2.clone(), Some(sign_body(&body2))).await; + assert_eq!( + s2, + StatusCode::CONFLICT, + "an ambiguous failure must HOLD the nonce: the retry is deduped, not re-run" + ); + + // The trail records the ambiguity, not a clean failure. + let log = std::fs::read_to_string(dir.path().join("audit.jsonl")).expect("audit log"); + assert!( + log.contains("\"NEEDS_RECONCILIATION_HostIsolation\""), + "the audit entry must make clear the action may have executed" + ); + } + + // ── T64 / RB-01: bounded rollback retry + terminal ROLLBACK_FAILED ───── + + #[tokio::test] + async fn rollback_retries_transient_failure_then_succeeds() { + // A mock that 500s on the FIRST device-action hit and 200s afterward. + // The bounded internal retry must absorb the transient failure and the + // rollback must ultimately succeed (200 rolled_back), with no human page. + let attempt = Arc::new(StdMutex::new(0u32)); + + async fn token() -> Json { + Json(json!({"access_token": "t", "expires_in": 1800})) + } + async fn flaky( + State(attempt): State>>, + ) -> Result, StatusCode> { + let mut n = attempt.lock().unwrap(); + *n += 1; + if *n == 1 { + // First attempt: transient server error. + Err(StatusCode::INTERNAL_SERVER_ERROR) + } else { + Ok(Json(json!({ "resources": [], "errors": [] }))) + } + } + let app = Router::new() + .route("/oauth2/token", axum::routing::post(token)) + .route( + "/devices/entities/devices-actions/v2", + axum::routing::post(flaky), + ) + .with_state(attempt.clone()); let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind"); let addr: SocketAddr = listener.local_addr().expect("addr"); tokio::spawn(async move { @@ -1966,9 +2389,84 @@ mod tests { "api_secret": "secret", "base_url": format!("http://{addr}"), }); - let body = action_body("e2e-rb-fail", Some(creds)); - let sig = sign_body(&body); - let status = post(&router, "/rollback", body, Some(sig)).await; + let body = action_body("rb-retry-ok", Some(creds)); + let (status, value) = post_json(&router, "/rollback", body).await; + assert_eq!( + status, + StatusCode::OK, + "the bounded retry must recover a transient rollback failure" + ); + assert_eq!(value["status"], "rolled_back"); + assert!( + *attempt.lock().unwrap() >= 2, + "the rollback must have retried at least once" + ); + } + + #[tokio::test] + async fn rollback_exhausts_retries_then_emits_terminal_rollback_failed() { + // A device-action that ALWAYS 500s. After ROLLBACK_MAX_ATTEMPTS the proxy + // emits the distinct terminal ROLLBACK_FAILED audit entry + body. A 5xx + // is AMBIGUOUS about whether the EDR partially applied the lift, so the + // nonce is HELD (may_have_executed:true) rather than released: the human + // pager reconciles, and a blind retry is deduped to 409 instead of + // risking a second uncoordinated lift. + let ((router, base_url), dir) = always_500_crowdstrike_router("rb-exhaust").await; + let creds = json!({ + "provider": "crowdstrike", + "api_key": "id", + "api_secret": "secret", + "base_url": base_url, + }); + let body = action_body("rb-exhaust-1", Some(creds.clone())); + let (status, value) = post_json(&router, "/rollback", body).await; assert_eq!(status, StatusCode::BAD_GATEWAY); + assert_eq!(value["status"], ROLLBACK_FAILED_STATUS); + assert_eq!( + value["may_have_executed"], true, + "a server 5xx is ambiguous: the EDR may have partially applied the lift" + ); + + // The terminal state is in the audit trail under a distinct label. + let log = std::fs::read_to_string(dir.path().join("audit.jsonl")).expect("audit log"); + assert!( + log.contains("\"ROLLBACK_FAILED_ROLLBACK_HostIsolation\""), + "the trail must carry the distinct terminal ROLLBACK_FAILED label, saw: {log}" + ); + + // The ambiguous failure HELD the nonce: a retry is deduped to 409, never + // a second uncoordinated lift, until a human reconciles. + let body2 = action_body("rb-exhaust-1", Some(creds)); + let s2 = post(&router, "/rollback", body2.clone(), Some(sign_body(&body2))).await; + assert_eq!( + s2, + StatusCode::CONFLICT, + "an ambiguous rollback failure holds the nonce: the retry is deduped, not re-run" + ); + } + + // ── PRX-08: weak signing secret fails closed in production ───────────── + + #[test] + fn secret_strength_error_fails_closed_only_in_production() { + let short = "too-short"; // < 32 bytes + let long = "this-secret-is-definitely-at-least-32-bytes-long"; // >= 32 + + // Production + short secret -> hard error (the caller panics on this). + assert!( + secret_strength_error(short, true).is_some(), + "a short secret must fail closed in production" + ); + // Production + long secret -> fine. + assert!( + secret_strength_error(long, true).is_none(), + "a >= 32-byte secret is accepted in production" + ); + // Dev/CI + short secret -> no hard error (the caller only warns). + assert!( + secret_strength_error(short, false).is_none(), + "a short secret only warns in dev/CI, never blocks" + ); + assert!(secret_strength_error(long, false).is_none()); } }