From 74e5cab25590e67d9ddcc608841a363893555632 Mon Sep 17 00:00:00 2001
From: keirsalterego <darksalterego99@gmail.com>
Date: Mon, 15 Jun 2026 11:51:07 +0530
Subject: [PATCH] fix(proxy): exactly-once on ambiguous EDR failure, rollback
 retry + terminal state

Four production-grade fixes in the containment proxy, each confirmed by the
prior audit.

T63 / CNT-05 (P1): the nonce was released on ANY EdrError, including a
transport drop AFTER the request reached the EDR, which could double-execute a
containment on retry. EdrError now distinguishes a pre-send transport failure
(connection refused, DNS: the action provably did NOT run, safe to release the
nonce) from an ambiguous one (timeout after send, lost response: the action MAY
have run). reqwest errors are classified at the dispatch boundary via
is_connect(). On /execute an ambiguous failure now HOLDS the nonce and returns a
distinct needs_reconciliation state (may_have_executed:true) plus an audit entry
that makes clear the action may have executed; only a provable non-event
releases the nonce for a clean retry.

T64 / RB-01 (P1): a transient EDR 5xx / ambiguous timeout on /rollback used to
return a generic 502 and lean entirely on the Python pager. /rollback now
retries the idempotent contain/lift up to ROLLBACK_MAX_ATTEMPTS with a short
linear backoff (4xx and pre-send are not retried), and on exhaustion emits a
distinct terminal ROLLBACK_FAILED audit entry + response body the pager keys on.
Audit-before-act ordering is preserved.

T79 PRX-08 (P3): the proxy booted on a signing secret < 32 bytes with only a
warning. It now fails closed (panic) in production, mirroring the no-secret
panic, and keeps the warning for dev/CI. The check is extracted as a pure,
unit-tested function.

T79 PRX-07 (P3): the per-tenant path rebuilt a fresh CrowdstrikeClient per
request, so the per-instance token cache never helped and every action paid a
full OAuth round-trip. Tokens are now cached process-wide, keyed by credential
identity (base_url + client_id + a hash of the secret, never the plaintext
secret), so consecutive actions for the same tenant reuse a valid bearer while
tenants stay isolated.

Adds tests for the pre-send vs ambiguous classification, the nonce
hold/release decision, the rollback retry + terminal ROLLBACK_FAILED path, the
PRX-08 fail-closed rule, and the PRX-07 cross-request token reuse + isolation.
---
 src/edr.rs  | 418 ++++++++++++++++++++++++++++--
 src/main.rs | 718 ++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 998 insertions(+), 138 deletions(-)

diff --git a/src/edr.rs b/src/edr.rs
index aafd10d..65f4962 100644
--- a/src/edr.rs
+++ b/src/edr.rs
@@ -72,13 +72,14 @@
 //! so a retry can run on fresh state.
 
 use std::env;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant};
 
+use dashmap::DashMap;
 use reqwest::Client;
 use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
 use thiserror::Error;
-use tokio::sync::Mutex;
 use tracing::info;
 
 use crate::actions::{ActionDirection, ActionType};
@@ -86,6 +87,29 @@ use crate::actions::{ActionDirection, ActionType};
 /// Default HTTP timeout for EDR calls, in seconds.
 const DEFAULT_TIMEOUT_SECS: u64 = 30;
 
+/// Process-wide CrowdStrike OAuth bearer cache, keyed by credential identity
+/// (PRX-07).
+///
+/// The per-tenant dispatch path builds a fresh `CrowdstrikeClient` for every
+/// request (so tenant A always acts with tenant A's key), which meant the old
+/// per-instance token cache never survived a single request: every action paid
+/// a full OAuth round-trip. This shared cache lives for the life of the process
+/// and is keyed by a stable, non-secret identity (`base_url` + `client_id` + a
+/// hash of the secret), so consecutive actions for the same tenant reuse a valid
+/// bearer instead of re-authenticating.
+///
+/// Keying on a credential identity rather than per-client keeps tenants
+/// isolated: tenant A's entry can never serve tenant B a token, and a rotated
+/// secret produces a different key (the old entry simply expires out). The
+/// cached value is a short-lived bearer + a refresh deadline; secrets are never
+/// stored as map keys (only a hash) and the cache is never logged.
+static CROWDSTRIKE_TOKEN_CACHE: OnceLock<DashMap<String, CachedToken>> = OnceLock::new();
+
+/// Accessor for the lazily-initialised shared token cache.
+fn token_cache() -> &'static DashMap<String, CachedToken> {
+    CROWDSTRIKE_TOKEN_CACHE.get_or_init(DashMap::new)
+}
+
 /// Errors that can happen during an EDR dispatch.
 ///
 /// Callers treat the transport-shaped variants the same on the wire (502
@@ -105,8 +129,19 @@ pub enum EdrError {
     #[error("edr returned server error {status}: {body}")]
     ServerError { status: u16, body: String },
 
-    /// Transport error (DNS, TCP, TLS, timeout).
-    #[error("edr transport error: {0}")]
+    /// Transport error that definitively happened BEFORE the request bytes
+    /// reached the EDR: connection refused, DNS failure, TLS handshake failure.
+    /// The side effect provably did NOT happen, so the caller can safely
+    /// release the nonce and allow a clean retry (CNT-05 / T63).
+    #[error("edr transport error (pre-send): {0}")]
+    TransportPreSend(String),
+
+    /// Transport error that is AMBIGUOUS about whether the EDR acted: a timeout
+    /// after the request was sent, a dropped/lost response, a body-read error.
+    /// The action MAY have executed, so the caller must NOT release the nonce
+    /// (releasing would let a retry double-execute the containment). Surfaced
+    /// as a needs-reconciliation state instead (CNT-05 / T63).
+    #[error("edr transport error (ambiguous, may have executed): {0}")]
     Transport(String),
 
     /// EDR returned a successful HTTP status but the response body
@@ -133,6 +168,56 @@ pub enum EdrError {
     },
 }
 
+impl EdrError {
+    /// True when this error proves the EDR side effect did NOT happen, so the
+    /// caller may safely release the nonce and allow a clean retry (CNT-05).
+    ///
+    /// Only `TransportPreSend` (connection refused, DNS, TLS handshake) and a
+    /// pre-send config failure (`Misconfigured`, `Unsupported`) qualify: in all
+    /// of those no request bytes reached the EDR. `Transport` (ambiguous), a
+    /// 4xx/5xx the EDR returned, and an unexpected response all leave open the
+    /// possibility the action executed, so they are NOT release-safe.
+    ///
+    /// `Misconfigured`/`Unsupported` are listed here for completeness, but the
+    /// handler short-circuits both before dispatch, so in practice only the two
+    /// transport variants drive the release decision at the dispatch boundary.
+    pub fn side_effect_definitely_did_not_happen(&self) -> bool {
+        matches!(
+            self,
+            EdrError::TransportPreSend(_)
+                | EdrError::Misconfigured(_)
+                | EdrError::Unsupported { .. }
+        )
+    }
+}
+
+/// Classify a reqwest error from a `.send()` call into a pre-send (safe to
+/// release the nonce) or ambiguous (the action may have executed) transport
+/// error (CNT-05 / T63).
+///
+/// The distinction is which side of "the request reached the EDR" the failure
+/// fell on:
+///
+/// - `is_connect()` means the TCP/TLS connection was never established, so the
+///   request bytes never left for the EDR. Provably no side effect: pre-send.
+/// - A timeout (`is_timeout()`), a lost/dropped response, or a body-read error
+///   can all fire AFTER the request was written to the socket, when the EDR may
+///   already have applied the action. Ambiguous: we must not release the nonce.
+///
+/// reqwest collapses DNS resolution failures into the connect phase, so they
+/// land in `is_connect()` and are correctly treated as pre-send. We bias the
+/// unknown/uncategorised case toward ambiguous (the conservative direction): a
+/// misclassified ambiguous error costs a human reconciliation, while a
+/// misclassified pre-send error costs a double-execute, which is the outcome
+/// this whole change exists to prevent.
+fn classify_transport_error(err: &reqwest::Error) -> EdrError {
+    if err.is_connect() {
+        EdrError::TransportPreSend(err.to_string())
+    } else {
+        EdrError::Transport(err.to_string())
+    }
+}
+
 /// Which EDR a per-tenant credential targets.
 ///
 /// Carried on `EdrCredentials` so the central proxy can serve tenants on
@@ -373,16 +458,21 @@ pub fn check_supported(
 
 /// CrowdStrike Falcon Real Time Response client.
 ///
-/// Holds a `reqwest::Client` and a mutex-guarded token cache. We use a
-/// single Mutex around the cache (rather than per-field locks) because
-/// the critical section is microseconds long and contention is bounded
-/// by request rate (low for human-approved actions).
+/// Holds a `reqwest::Client` and a `cache_key` into the process-wide
+/// `CROWDSTRIKE_TOKEN_CACHE` (PRX-07). The token cache is shared across requests
+/// rather than per-instance, because the per-tenant dispatch path builds a fresh
+/// client per request: a per-instance cache would be discarded immediately and
+/// every action would re-authenticate. Keying the shared cache by credential
+/// identity reuses a valid bearer across a tenant's consecutive actions while
+/// keeping tenants isolated.
 pub struct CrowdstrikeClient {
     http: Client,
     base_url: String,
     client_id: String,
     client_secret: String,
-    token: Mutex<Option<CachedToken>>,
+    /// Stable, non-secret key into the shared token cache. Derived from
+    /// `base_url` + `client_id` + a hash of the secret (see `token_cache_key`).
+    cache_key: String,
 }
 
 #[derive(Clone)]
@@ -392,6 +482,20 @@ struct CachedToken {
     refresh_at: Instant,
 }
 
+/// Build the shared-cache key for a CrowdStrike credential.
+///
+/// Combines `base_url` and `client_id` (the credential's identity) with a
+/// SHA-256 hash of the secret, so a rotated secret keys to a different entry
+/// (the stale token is never reused) without ever putting the plaintext secret
+/// in a map key. Two requests for the same tenant credential produce the same
+/// key and therefore share a bearer.
+fn token_cache_key(base_url: &str, client_id: &str, client_secret: &str) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(client_secret.as_bytes());
+    let secret_hash = hex::encode(hasher.finalize());
+    format!("{base_url}\u{0}{client_id}\u{0}{secret_hash}")
+}
+
 /// Shape of the OAuth2 token response from
 /// `POST /oauth2/token`. We only keep the fields we use.
 #[derive(Deserialize)]
@@ -432,26 +536,33 @@ impl CrowdstrikeClient {
             .build()
             .map_err(|e| EdrError::Misconfigured(format!("http client build failed: {e}")))?;
 
+        let cache_key = token_cache_key(&base_url, &client_id, &client_secret);
         Ok(Self {
             http,
             base_url,
             client_id,
             client_secret,
-            token: Mutex::new(None),
+            cache_key,
         })
     }
 
     /// Get a bearer token, refreshing if necessary.
     ///
-    /// Concurrency: only one task can hold the lock at a time, so
-    /// concurrent first-callers serialize on the token fetch. After the
-    /// first fetch, all callers within the cache window get the cached value
-    /// without re-fetching. A per-request client (the per-tenant path) lives
-    /// for one action, so it fetches once and is dropped.
+    /// Reads from and writes to the process-wide `CROWDSTRIKE_TOKEN_CACHE`
+    /// (PRX-07), keyed by `self.cache_key` (credential identity). Consecutive
+    /// actions for the same tenant therefore reuse a valid bearer across
+    /// requests, even though each per-tenant request builds a fresh client. A
+    /// cache hit returns with zero network calls; a miss or expired entry
+    /// fetches once and writes the new token back for the next request.
+    ///
+    /// Concurrency: a brief race where two first-callers both miss the cache and
+    /// both fetch is harmless, both write a valid bearer and the last writer
+    /// wins. We do not hold a lock across the network fetch (DashMap entries are
+    /// not held across the await), so a slow CrowdStrike never serializes every
+    /// other tenant's token fetch behind it.
     async fn bearer_token(&self) -> Result<String, EdrError> {
-        let mut guard = self.token.lock().await;
-
-        if let Some(cached) = guard.as_ref() {
+        // Fast path: a live cached bearer for this exact credential identity.
+        if let Some(cached) = token_cache().get(&self.cache_key) {
             if Instant::now() < cached.refresh_at {
                 return Ok(cached.bearer.clone());
             }
@@ -459,6 +570,12 @@ impl CrowdstrikeClient {
 
         // Need a fresh token.
         let url = format!("{}/oauth2/token", self.base_url);
+        // A failed token fetch is ALWAYS release-safe regardless of pre/post
+        // send: the containment device-action call is never made without a
+        // bearer, so the action provably did not happen. We classify a connect
+        // failure as pre-send and map any other token-fetch transport error to
+        // pre-send too, because the only side effect that matters (the action)
+        // could not have run. The token endpoint itself is read-only.
         let resp = self
             .http
             .post(&url)
@@ -468,7 +585,7 @@ impl CrowdstrikeClient {
             ])
             .send()
             .await
-            .map_err(|e| EdrError::Transport(e.to_string()))?;
+            .map_err(|e| EdrError::TransportPreSend(e.to_string()))?;
 
         let status = resp.status();
         if !status.is_success() {
@@ -488,11 +605,13 @@ impl CrowdstrikeClient {
         // CrowdStrike, and to avoid the cliff where the token expires
         // mid-flight.
         let lifetime = Duration::from_secs((parsed.expires_in * 8) / 10);
-        let new = CachedToken {
-            bearer: parsed.access_token.clone(),
-            refresh_at: Instant::now() + lifetime,
-        };
-        *guard = Some(new);
+        token_cache().insert(
+            self.cache_key.clone(),
+            CachedToken {
+                bearer: parsed.access_token.clone(),
+                refresh_at: Instant::now() + lifetime,
+            },
+        );
 
         Ok(parsed.access_token)
     }
@@ -524,6 +643,12 @@ impl CrowdstrikeClient {
             ids: vec![host],
         };
 
+        // This is THE side-effecting call. Classify a transport failure here as
+        // pre-send (connection refused / DNS: the action never left) vs
+        // ambiguous (timeout after send / lost response: the action may have
+        // run). The nonce-release decision in the handler keys off that
+        // distinction so an ambiguous failure never lets a retry double-isolate
+        // a host (CNT-05 / T63).
         let resp = self
             .http
             .post(&url)
@@ -531,7 +656,7 @@ impl CrowdstrikeClient {
             .json(&body)
             .send()
             .await
-            .map_err(|e| EdrError::Transport(e.to_string()))?;
+            .map_err(|e| classify_transport_error(&e))?;
 
         let status = resp.status();
         if status.is_success() {
@@ -653,6 +778,9 @@ impl SentinelOneClient {
             filter: S1Filter { uuids: vec![host] },
         };
 
+        // THE side-effecting call. Same pre-send vs ambiguous classification as
+        // CrowdStrike so the handler's nonce-release decision is correct for S1
+        // too (CNT-05 / T63).
         let resp = self
             .http
             .post(&url)
@@ -660,7 +788,7 @@ impl SentinelOneClient {
             .json(&body)
             .send()
             .await
-            .map_err(|e| EdrError::Transport(e.to_string()))?;
+            .map_err(|e| classify_transport_error(&e))?;
 
         let status = resp.status();
         if status.is_success() {
@@ -981,6 +1109,10 @@ mod tests {
         // points CrowdStrike at an unreachable base_url, so a transport
         // error proves the per-tenant client ran INSTEAD of the noop
         // fallback. If the fallback had been used we would get Ok.
+        //
+        // The failure is a connection refused on the OAuth token fetch, i.e. a
+        // pre-send transport error: no device-action call was ever made, so the
+        // action provably did not happen and the nonce is release-safe (CNT-05).
         let fallback = EdrClient::Noop;
         let creds = cs_creds();
         let err = dispatch(
@@ -993,11 +1125,88 @@ mod tests {
         .await
         .expect_err("per-tenant CrowdStrike client should attempt a real call and fail transport");
         assert!(
-            matches!(err, EdrError::Transport(_)),
-            "expected transport error from per-tenant client, got {err:?}"
+            matches!(err, EdrError::TransportPreSend(_)),
+            "expected a pre-send transport error from per-tenant client, got {err:?}"
+        );
+        assert!(
+            err.side_effect_definitely_did_not_happen(),
+            "connection refused before the action call means the side effect did not happen"
+        );
+    }
+
+    #[tokio::test]
+    async fn classify_connect_failure_is_pre_send_and_release_safe() {
+        // A connection refused (nothing listening) is reqwest::Error::is_connect:
+        // the request bytes never left, so it must classify as pre-send and be
+        // release-safe (CNT-05 / T63). We provoke a genuine connect failure by
+        // posting to a closed loopback port with the async client.
+        let client = Client::builder()
+            .timeout(Duration::from_secs(2))
+            .build()
+            .expect("client");
+        let err = client
+            .post("http://127.0.0.1:1/oauth2/token")
+            .send()
+            .await
+            .expect_err("connection to a closed port must fail");
+        assert!(
+            err.is_connect(),
+            "sanity: a closed-port failure should be a connect error, got {err:?}"
+        );
+        let classified = classify_transport_error(&err);
+        assert!(
+            matches!(classified, EdrError::TransportPreSend(_)),
+            "a connect failure must classify as pre-send, got {classified:?}"
+        );
+        assert!(
+            classified.side_effect_definitely_did_not_happen(),
+            "a pre-send transport error must be release-safe"
+        );
+    }
+
+    #[test]
+    fn ambiguous_transport_is_not_release_safe() {
+        // An ambiguous transport error (timeout after send / lost response) MUST
+        // NOT be release-safe: the action may have executed, so releasing the
+        // nonce would let a retry double-execute (CNT-05 / T63).
+        let err = EdrError::Transport("timed out reading response".to_string());
+        assert!(
+            !err.side_effect_definitely_did_not_happen(),
+            "an ambiguous transport error must not be treated as release-safe"
         );
     }
 
+    #[test]
+    fn server_and_client_errors_are_not_release_safe() {
+        // A 4xx/5xx the EDR returned means it WAS contacted; the action may have
+        // taken effect even on a 5xx. Neither is release-safe.
+        let server = EdrError::ServerError {
+            status: 503,
+            body: String::new(),
+        };
+        let client_err = EdrError::ClientError {
+            status: 429,
+            body: String::new(),
+        };
+        assert!(!server.side_effect_definitely_did_not_happen());
+        assert!(!client_err.side_effect_definitely_did_not_happen());
+    }
+
+    #[test]
+    fn unsupported_and_misconfigured_are_release_safe_pre_dispatch() {
+        // Both are decided before any network call, so they are release-safe by
+        // construction (the handler short-circuits Unsupported separately, but
+        // the predicate must still report the truth).
+        let unsupported = EdrError::Unsupported {
+            action: ActionType::ProcessKill,
+            provider: "crowdstrike",
+            detail: "no mapping",
+        };
+        let misconfigured = EdrError::Misconfigured("no base_url".to_string());
+        assert!(unsupported.side_effect_definitely_did_not_happen());
+        assert!(misconfigured.side_effect_definitely_did_not_happen());
+    }
+
     #[tokio::test]
     async fn unusable_credential_falls_back_to_global() {
         // A credential whose secret is blank is NOT usable, so dispatch must
@@ -1140,4 +1349,157 @@ mod tests {
             "S1 must receive exactly the approved actions, in order, and nothing for PROCESS_KILL"
         );
     }
+
+    // ── CrowdStrike token cache (PRX-07) ───────────────────────────────────
+    //
+    // The shared `CROWDSTRIKE_TOKEN_CACHE` must let consecutive actions for the
+    // same tenant credential reuse a bearer ACROSS requests, even though each
+    // per-tenant request builds a fresh CrowdstrikeClient. A different
+    // credential must NOT reuse another's token.
+
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    /// A CrowdStrike-shaped mock that counts how many times `/oauth2/token` is
+    /// hit, so a test can prove the second action reused the cached bearer
+    /// instead of re-authenticating.
+    async fn spawn_counting_mock_crowdstrike() -> (String, Arc<AtomicUsize>) {
+        use axum::extract::State;
+        use axum::response::Json;
+        use axum::Router;
+
+        let token_hits = Arc::new(AtomicUsize::new(0));
+
+        async fn token(State(hits): State<Arc<AtomicUsize>>) -> Json<serde_json::Value> {
+            hits.fetch_add(1, Ordering::SeqCst);
+            Json(serde_json::json!({
+                "access_token": "mock-bearer",
+                "token_type": "bearer",
+                "expires_in": 1800,
+            }))
+        }
+
+        async fn device_action() -> Json<serde_json::Value> {
+            Json(serde_json::json!({ "resources": [], "errors": [] }))
+        }
+
+        let app = Router::new()
+            .route("/oauth2/token", axum::routing::post(token))
+            .route(
+                "/devices/entities/devices-actions/v2",
+                axum::routing::post(device_action),
+            )
+            .with_state(token_hits.clone());
+
+        let listener = tokio::net::TcpListener::bind("127.0.0.1:0")
+            .await
+            .expect("bind mock cs");
+        let addr: SocketAddr = listener.local_addr().expect("mock cs addr");
+        tokio::spawn(async move {
+            axum::serve(listener, app).await.expect("mock cs serve");
+        });
+        (format!("http://{addr}"), token_hits)
+    }
+
+    #[tokio::test]
+    async fn crowdstrike_token_is_cached_across_requests_for_same_credential() {
+        // Two actions for the SAME tenant credential, each through a fresh
+        // CrowdstrikeClient (built per dispatch). The first fetches a token, the
+        // second must reuse it from the shared cache: exactly one token fetch
+        // for two actions. This is the PRX-07 fix; before it, every action paid
+        // a full OAuth round-trip.
+        let (base_url, token_hits) = spawn_counting_mock_crowdstrike().await;
+        let fallback = EdrClient::Noop;
+        // Unique client_id so this test's cache entry never collides with
+        // another test's (the cache is process-global).
+        let client_id = format!("tenant-cache-{}", std::process::id());
+        let creds = EdrCredentials {
+            provider: EdrProvider::Crowdstrike,
+            api_key: client_id,
+            api_secret: Some("secret-1".to_string()),
+            base_url: Some(base_url),
+        };
+
+        for dir in [ActionDirection::Apply, ActionDirection::Reverse] {
+            dispatch(
+                &fallback,
+                Some(&creds),
+                ActionType::HostIsolation,
+                dir,
+                "device-1",
+            )
+            .await
+            .expect("dispatch should succeed against the mock");
+        }
+
+        assert_eq!(
+            token_hits.load(Ordering::SeqCst),
+            1,
+            "the second action must reuse the cached bearer: exactly one OAuth fetch for two \
+             actions (PRX-07)"
+        );
+    }
+
+    #[tokio::test]
+    async fn crowdstrike_token_cache_does_not_leak_across_distinct_credentials() {
+        // Two DIFFERENT credentials (distinct secret -> distinct cache key) must
+        // each fetch their own token: a tenant never serves another tenant a
+        // bearer from the shared cache.
+        let (base_url, token_hits) = spawn_counting_mock_crowdstrike().await;
+        let fallback = EdrClient::Noop;
+        let id = format!("tenant-isolate-{}", std::process::id());
+
+        let creds_a = EdrCredentials {
+            provider: EdrProvider::Crowdstrike,
+            api_key: id.clone(),
+            api_secret: Some("secret-A".to_string()),
+            base_url: Some(base_url.clone()),
+        };
+        let creds_b = EdrCredentials {
+            provider: EdrProvider::Crowdstrike,
+            api_key: id,
+            api_secret: Some("secret-B".to_string()),
+            base_url: Some(base_url),
+        };
+
+        dispatch(
+            &fallback,
+            Some(&creds_a),
+            ActionType::HostIsolation,
+            ActionDirection::Apply,
+            "device-1",
+        )
+        .await
+        .expect("creds A dispatch");
+        dispatch(
+            &fallback,
+            Some(&creds_b),
+            ActionType::HostIsolation,
+            ActionDirection::Apply,
+            "device-1",
+        )
+        .await
+        .expect("creds B dispatch");
+
+        assert_eq!(
+            token_hits.load(Ordering::SeqCst),
+            2,
+            "distinct credentials must each fetch their own token; no cross-credential reuse"
+        );
+    }
+
+    #[test]
+    fn token_cache_key_is_stable_and_secret_sensitive() {
+        // Same identity -> same key (so the cache hits). Different secret ->
+        // different key (so a rotated secret never reuses a stale token). The
+        // plaintext secret must NOT appear in the key.
+        let k1 = token_cache_key("https://api.example", "id-1", "secret-xyz");
+        let k2 = token_cache_key("https://api.example", "id-1", "secret-xyz");
+        let k3 = token_cache_key("https://api.example", "id-1", "rotated-secret");
+        assert_eq!(k1, k2, "same credential identity must produce a stable key");
+        assert_ne!(k1, k3, "a rotated secret must produce a different key");
+        assert!(
+            !k1.contains("secret-xyz"),
+            "the plaintext secret must never appear in the cache key"
+        );
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 3d04529..b748ce2 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -74,6 +74,26 @@ mod nonce;
 /// timestamps caused by client clock skew or deliberate manipulation.
 const REPLAY_WINDOW_SECONDS: i64 = 30;
 
+/// Total number of `/rollback` dispatch attempts before we give up and emit the
+/// terminal `ROLLBACK_FAILED` state (RB-01 / T64). One initial try plus
+/// `ROLLBACK_MAX_ATTEMPTS - 1` retries. Small on purpose: contain/lift are
+/// idempotent, so a couple of quick retries absorb a transient EDR 5xx or a
+/// brief network blip without turning the request handler into a long-running
+/// pager. A persistent failure still surfaces promptly to the human pager.
+const ROLLBACK_MAX_ATTEMPTS: u32 = 3;
+
+/// Base backoff between `/rollback` retry attempts. The delay grows linearly
+/// with the attempt number (attempt 1 waits one base, attempt 2 waits two), a
+/// gentle backoff that stays well inside a normal request timeout. Kept short
+/// because a human approved this rollback and is waiting on the result.
+const ROLLBACK_RETRY_BACKOFF: std::time::Duration = std::time::Duration::from_millis(250);
+
+/// Minimum acceptable length (bytes) of the proxy signing secret (PRX-08).
+/// 32 bytes = 256 bits, matching the HMAC-SHA256 output width. A shorter secret
+/// weakens the one check that authorizes a real EDR action, so it fails closed
+/// in production.
+const MIN_SECRET_LEN: usize = 32;
+
 /// Default per-tenant request budget per one-second window. One tenant's
 /// containment burst can consume at most this many slots per second; once it
 /// is exhausted, only THAT tenant gets 429s. Every other tenant's
@@ -349,6 +369,40 @@ struct ExecuteResponse {
     simulated: bool,
 }
 
+/// Distinct status string the `/rollback` path returns when every internal
+/// retry is exhausted and the lift may not have taken effect. The Python pager
+/// keys on this exact string (and the `502` status) to page a human for manual
+/// reconciliation, distinct from a transient blip that the bounded retry
+/// already recovered from (RB-01 / T64).
+const ROLLBACK_FAILED_STATUS: &str = "rollback_failed";
+
+/// Distinct status string the `/execute` and `/rollback` paths return when an
+/// EDR transport failure is AMBIGUOUS about whether the action ran (timeout
+/// after send, lost response). The nonce is deliberately NOT released, so a
+/// blind retry cannot double-execute; the Python side keys on this to surface a
+/// human-reconciliation task instead of an auto-retry (CNT-05 / T63).
+const NEEDS_RECONCILIATION_STATUS: &str = "needs_reconciliation";
+
+/// Body for the error responses (`/execute` ambiguous, `/rollback` terminal
+/// failure). Distinct from `ExecuteResponse` so the Python caller can pattern
+/// match on `status` without it ever being confused with a success body. The
+/// `may_have_executed` flag is the load-bearing field: true means the EDR may
+/// already have acted and the nonce was held back to block a double-execute.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+struct ActionFailureResponse {
+    /// One of `ROLLBACK_FAILED_STATUS` or `NEEDS_RECONCILIATION_STATUS`.
+    status: String,
+
+    /// True when the action MAY have executed (ambiguous transport failure or
+    /// a rollback whose final attempt was ambiguous). When true the nonce was
+    /// NOT released, so a naive retry will be deduped rather than re-run.
+    may_have_executed: bool,
+
+    /// Honesty label echoed from the request (demo/mock fleet), same meaning
+    /// as on `ExecuteResponse`.
+    simulated: bool,
+}
+
 /// Query parameters for the audit export endpoint.
 #[derive(Debug, Deserialize)]
 struct ExportQuery {
@@ -386,11 +440,7 @@ async fn health() -> Json<serde_json::Value> {
 ///
 /// Thin wrapper over `run_action` with `ActionDirection::Apply`. See
 /// `run_action` for the full step-numbered lifecycle.
-async fn execute(
-    state: State<AppState>,
-    headers: HeaderMap,
-    body: Bytes,
-) -> Result<Json<ExecuteResponse>, StatusCode> {
+async fn execute(state: State<AppState>, headers: HeaderMap, body: Bytes) -> Response {
     run_action(state, headers, body, actions::ActionDirection::Apply).await
 }
 
@@ -402,11 +452,7 @@ async fn execute(
 /// difference is `ActionDirection::Reverse`, which makes the EDR client call
 /// the inverse vendor action, and the audit `action_type` is prefixed
 /// `ROLLBACK_` so the trail names what was undone.
-async fn rollback(
-    state: State<AppState>,
-    headers: HeaderMap,
-    body: Bytes,
-) -> Result<Json<ExecuteResponse>, StatusCode> {
+async fn rollback(state: State<AppState>, headers: HeaderMap, body: Bytes) -> Response {
     run_action(state, headers, body, actions::ActionDirection::Reverse).await
 }
 
@@ -434,7 +480,7 @@ async fn run_action(
     headers: HeaderMap,
     body: Bytes,
     direction: actions::ActionDirection,
-) -> Result<Json<ExecuteResponse>, StatusCode> {
+) -> Response {
     // ─ Step 1: Verify HMAC on the RAW bytes we received. ──────────────
     //
     // Critical correctness point: we verify against `body` (the exact
@@ -443,14 +489,16 @@ async fn run_action(
     // signatures, and historically did - that bug masked the real
     // verification because differing serialization made every signature
     // mismatch indistinguishable from an invalid signature.
-    let signature = headers
+    let Some(signature) = headers
         .get("X-Vyrox-Signature")
         .and_then(|v| v.to_str().ok())
-        .ok_or(StatusCode::UNAUTHORIZED)?;
+    else {
+        return StatusCode::UNAUTHORIZED.into_response();
+    };
 
     if let Err(err) = hmac::verify_signature(state.hmac_secret.as_bytes(), &body, signature) {
         warn!(error = %err, "signature verification failed");
-        return Err(StatusCode::UNAUTHORIZED);
+        return StatusCode::UNAUTHORIZED.into_response();
     }
 
     // ─ Step 2: Parse JSON. ────────────────────────────────────────────
@@ -458,11 +506,12 @@ async fn run_action(
     // Only after the HMAC passes do we trust the body enough to parse
     // it. Parsing before verification would expose any serde panic /
     // pathological input to unauthenticated callers.
-    let payload: ExecuteRequest =
-        serde_json::from_slice(&body).map_err(|_| StatusCode::BAD_REQUEST)?;
+    let Ok(payload) = serde_json::from_slice::<ExecuteRequest>(&body) else {
+        return StatusCode::BAD_REQUEST.into_response();
+    };
 
     if payload.request_id.trim().is_empty() {
-        return Err(StatusCode::BAD_REQUEST);
+        return StatusCode::BAD_REQUEST.into_response();
     }
 
     // ─ Step 2b: Per-tenant rate limit. ────────────────────────────────
@@ -477,11 +526,13 @@ async fn run_action(
             tenant_id = %payload.tenant_id,
             "per-tenant rate limit exceeded"
         );
-        return Err(StatusCode::TOO_MANY_REQUESTS);
+        return StatusCode::TOO_MANY_REQUESTS.into_response();
     }
 
     // ─ Step 3: Replay window. ─────────────────────────────────────────
-    check_replay_window(payload.approved_at)?;
+    if let Err(status) = check_replay_window(payload.approved_at) {
+        return status.into_response();
+    }
 
     // ─ Step 4: Nonce dedup. ───────────────────────────────────────────
     //
@@ -490,30 +541,31 @@ async fn run_action(
     // result and the EDR is NOT called again. A nonce-store transport
     // error (Redis down) fails CLOSED with a 503: skipping dedup could
     // double-execute a containment, so we refuse rather than guess.
-    let claim = state
-        .nonces
-        .claim_or_replay(&payload.request_id)
-        .await
-        .map_err(|err| {
+    let claim = match state.nonces.claim_or_replay(&payload.request_id).await {
+        Ok(claim) => claim,
+        Err(err) => {
             warn!(error = %err, "nonce store unavailable; failing closed");
-            StatusCode::SERVICE_UNAVAILABLE
-        })?;
+            return StatusCode::SERVICE_UNAVAILABLE.into_response();
+        }
+    };
     match claim {
         nonce::Outcome::FreshClaim => { /* fall through to execution */ }
         nonce::Outcome::AlreadyExecuted {
             cached_response_json,
         } => {
             info!(request_id = %payload.request_id, "replaying cached response");
-            let cached: ExecuteResponse = serde_json::from_str(&cached_response_json)
-                .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
-            return Ok(Json(ExecuteResponse {
+            let Ok(cached) = serde_json::from_str::<ExecuteResponse>(&cached_response_json) else {
+                return StatusCode::INTERNAL_SERVER_ERROR.into_response();
+            };
+            return Json(ExecuteResponse {
                 status: "replayed".to_string(),
                 simulated: cached.simulated,
-            }));
+            })
+            .into_response();
         }
         nonce::Outcome::InFlight => {
             warn!(request_id = %payload.request_id, "duplicate while in-flight");
-            return Err(StatusCode::CONFLICT);
+            return StatusCode::CONFLICT.into_response();
         }
     }
 
@@ -540,7 +592,7 @@ async fn run_action(
         // the underlying issue (disk full, perm error) is fixed.
         warn!(error = %err, "audit write failed; releasing nonce claim");
         release_nonce(&state.nonces, &payload.request_id).await;
-        return Err(StatusCode::INTERNAL_SERVER_ERROR);
+        return StatusCode::INTERNAL_SERVER_ERROR.into_response();
     }
 
     // ─ Step 6: Execute. ───────────────────────────────────────────────
@@ -557,81 +609,44 @@ async fn run_action(
     // cannot faithfully perform (e.g. PROCESS_KILL on CrowdStrike) must fail
     // loudly with a 501 before any network call, never be silently substituted.
     // The check is pure (action-name mappers only, no client, no network).
+    //
+    // contain/lift are idempotent, so the Reverse (/rollback) direction wraps
+    // the dispatch in a small bounded retry (RB-01 / T64): a transient EDR 5xx
+    // or a brief blip is absorbed in-proxy rather than immediately paging a
+    // human. /execute keeps a single attempt; a failed isolate is retried at the
+    // approval layer, and silently re-isolating is not something we want this
+    // handler doing on its own.
     let success_status = match direction {
         actions::ActionDirection::Apply => "executed",
         actions::ActionDirection::Reverse => "rolled_back",
     };
-    let supported = edr::check_supported(
+
+    // Pure supportability pre-flight: reject an action the provider cannot
+    // faithfully perform BEFORE any network call (and before the retry loop), so
+    // an unsupported action is one 501 with zero EDR calls, never retried and
+    // never silently substituted. `dispatch` re-checks internally, this keeps
+    // the decision explicit at the handler and out of the retry path.
+    if let Err(err) = edr::check_supported(
         &state.edr,
         payload.edr_credentials.as_ref(),
         payload.action_type,
         direction,
-    );
-    let outcome: Result<ExecuteResponse, edr::EdrError> = match supported {
-        Err(err) => Err(err),
-        Ok(()) => {
-            // Real dispatch. The per-tenant credentials on the request take
-            // precedence over the global env fallback (E7). The EDR client
-            // owns its own retries, timeouts, and error mapping. We echo the
-            // request's `simulated` honesty label back unchanged.
-            edr::dispatch(
-                &state.edr,
-                payload.edr_credentials.as_ref(),
-                payload.action_type,
-                direction,
-                &payload.host,
-            )
-            .await
-            .map(|()| ExecuteResponse {
-                status: success_status.to_string(),
-                simulated: payload.simulated,
-            })
-        }
-    };
-    let response = match outcome {
-        Ok(response) => response,
-        // An action the provider cannot faithfully perform is 501 Not
-        // Implemented (never silently substituted with a different action),
-        // in dry-run and live alike; every other failure is 502 Bad
-        // Gateway. Either way we append a failure audit entry so the trail
-        // records that the intent did NOT happen, then release the nonce so
-        // a retry runs on fresh state.
+    ) {
+        return handle_dispatch_failure(&state, &payload, direction, &audit_action, err).await;
+    }
+
+    let dispatch_result = dispatch_with_retry(&state, &payload, direction).await;
+
+    match dispatch_result {
+        Ok(()) => { /* fall through to success caching + 200 */ }
         Err(err) => {
-            let status = match err {
-                edr::EdrError::Unsupported { .. } => StatusCode::NOT_IMPLEMENTED,
-                _ => StatusCode::BAD_GATEWAY,
-            };
-            warn!(
-                request_id = %payload.request_id,
-                ?direction,
-                error = %err,
-                status = status.as_u16(),
-                "EDR action failed; auditing failure and releasing nonce claim"
-            );
-            // The step-5 entry recorded the INTENT. Without this
-            // companion entry the trail would read as if the action
-            // happened. Best-effort: the failure status is returned
-            // regardless, and a write error is logged, not swallowed
-            // into a fake success.
-            let failure_entry = audit::build_entry(
-                payload.tenant_id.clone(),
-                format!("FAILED_{audit_action}"),
-                payload.host.clone(),
-                payload.approved_by.clone(),
-                payload.simulated,
-            );
-            if let Err(audit_err) =
-                audit::append_audit(&state.audit_log_path, &state.audit_chain, failure_entry).await
-            {
-                warn!(
-                    request_id = %payload.request_id,
-                    error = %audit_err,
-                    "failed to write failure audit entry"
-                );
-            }
-            release_nonce(&state.nonces, &payload.request_id).await;
-            return Err(status);
+            return handle_dispatch_failure(&state, &payload, direction, &audit_action, err).await;
         }
+    }
+
+    let response = ExecuteResponse {
+        status: success_status.to_string(),
+        simulated: payload.simulated,
     };
 
     // ─ Step 7: Cache the response for future retries. ────────────────
@@ -643,8 +658,9 @@ async fn run_action(
     // idempotent EDR action, which the replay window and EDR idempotency both
     // bound, and which is strictly safer than reporting a failure for an action
     // that succeeded.
-    let cache_payload =
-        serde_json::to_string(&response).map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+    let Ok(cache_payload) = serde_json::to_string(&response) else {
+        return StatusCode::INTERNAL_SERVER_ERROR.into_response();
+    };
     if let Err(err) = state
         .nonces
         .record_response(&payload.request_id, cache_payload)
@@ -657,7 +673,241 @@ async fn run_action(
         );
     }
 
-    Ok(Json(response))
+    Json(response).into_response()
+}
+
+/// Dispatch the action to the EDR, retrying transient failures for the
+/// idempotent `/rollback` (Reverse) direction (RB-01 / T64).
+///
+/// `/execute` (Apply) dispatches exactly once: re-isolating a host on a blip is
+/// the approval layer's call, not this handler's. `/rollback` (Reverse) retries
+/// up to `ROLLBACK_MAX_ATTEMPTS` times because contain/lift are idempotent and a
+/// transient EDR 5xx or a momentary blip should not page a human when one more
+/// quick attempt would succeed.
+///
+/// What is retried: only genuinely transient, retry-safe failures, a server
+/// 5xx, or an ambiguous transport error where the lift may not have landed.
+/// What is NOT retried: a pre-send failure (returned immediately so the nonce
+/// can be released and the request retried cleanly from scratch), a 4xx client
+/// error (a bad request will not get better by repeating it), and `Unsupported`
+/// / `Misconfigured` (decided before any call). The supportability check that
+/// rejects unsupported actions is pure and runs inside `dispatch`, so an
+/// unsupported action still fails on the first attempt with zero calls.
+async fn dispatch_with_retry(
+    state: &AppState,
+    payload: &ExecuteRequest,
+    direction: actions::ActionDirection,
+) -> Result<(), edr::EdrError> {
+    let max_attempts = match direction {
+        actions::ActionDirection::Apply => 1,
+        actions::ActionDirection::Reverse => ROLLBACK_MAX_ATTEMPTS,
+    };
+
+    let mut attempt = 0;
+    loop {
+        attempt += 1;
+        let result = edr::dispatch(
+            &state.edr,
+            payload.edr_credentials.as_ref(),
+            payload.action_type,
+            direction,
+            &payload.host,
+        )
+        .await;
+
+        let err = match result {
+            Ok(()) => return Ok(()),
+            Err(err) => err,
+        };
+
+        // Stop early when another attempt cannot help: we are out of attempts,
+        // the failure is not retry-safe (4xx, pre-send, unsupported,
+        // misconfigured), or this is the Apply direction (max_attempts == 1).
+        if attempt >= max_attempts || !is_retryable(&err) {
+            return Err(err);
+        }
+
+        warn!(
+            request_id = %payload.request_id,
+            ?direction,
+            attempt,
+            max_attempts,
+            error = %err,
+            "rollback dispatch failed transiently; retrying after backoff"
+        );
+        // Linear backoff: attempt 1 waits one base, attempt 2 waits two. Short
+        // by design, a human approved this rollback and is waiting on it.
+        tokio::time::sleep(ROLLBACK_RETRY_BACKOFF * attempt).await;
+    }
+}
+
+/// Whether a dispatch error is worth another attempt for an idempotent action.
+///
+/// Only a server 5xx and an ambiguous transport error qualify: both can be a
+/// momentary condition that a quick retry of an idempotent contain/lift clears.
+/// A 4xx will not improve on repeat, and a pre-send / unsupported /
+/// misconfigured error is handled by releasing the nonce (pre-send) or failing
+/// loudly, never by retrying.
+fn is_retryable(err: &edr::EdrError) -> bool {
+    matches!(
+        err,
+        edr::EdrError::ServerError { .. } | edr::EdrError::Transport(_)
+    )
+}
+
+/// Build the failure response (and companion audit entries) after a dispatch
+/// failed, honoring the exactly-once and rollback-terminal-state contracts.
+///
+/// Three distinct outcomes, by error class and direction:
+///
+/// 1. **Unsupported** (`/execute` or `/rollback`): 501 Not Implemented. The
+///    action has no faithful provider mapping; never substituted. The nonce is
+///    released, a `FAILED_` entry records the intent did not happen.
+/// 2. **Pre-send transport failure** (connection refused, DNS): the action
+///    provably did NOT run, so we release the nonce so a retry runs cleanly on
+///    fresh state, write a `FAILED_` entry, and return 502.
+/// 3. **Ambiguous transport failure** (timeout after send, lost response): the
+///    action MAY have run. We do NOT release the nonce, a blind retry would risk
+///    a double-execute. We write a distinct `NEEDS_RECONCILIATION_` audit entry
+///    making clear the action may have executed, and return a body the Python
+///    pager keys on (`needs_reconciliation`, `may_have_executed: true`).
+/// 4. **Rollback that exhausted its retries** (`/rollback` only, server 5xx or
+///    ambiguous after `ROLLBACK_MAX_ATTEMPTS`): a distinct terminal
+///    `ROLLBACK_FAILED` audit entry + `rollback_failed` response body the pager
+///    keys on. The nonce is released for a 5xx (the lift did not land, a fresh
+///    retry is safe) and held for an ambiguous final attempt (it may have).
+///
+/// Audit-before-act ordering is preserved: the intent entry was already written
+/// in step 5, and every companion entry here is written BEFORE this returns.
+async fn handle_dispatch_failure(
+    state: &AppState,
+    payload: &ExecuteRequest,
+    direction: actions::ActionDirection,
+    audit_action: &str,
+    err: edr::EdrError,
+) -> Response {
+    let is_rollback = matches!(direction, actions::ActionDirection::Reverse);
+
+    // An action with no faithful provider mapping is 501 and is never retried or
+    // reconciled: nothing ran, release the nonce and audit the non-event.
+    if matches!(err, edr::EdrError::Unsupported { .. }) {
+        warn!(
+            request_id = %payload.request_id,
+            ?direction,
+            error = %err,
+            "EDR action unsupported; auditing non-event and releasing nonce"
+        );
+        write_failure_entry(state, payload, &format!("FAILED_{audit_action}")).await;
+        release_nonce(&state.nonces, &payload.request_id).await;
+        return StatusCode::NOT_IMPLEMENTED.into_response();
+    }
+
+    // Did the side effect provably NOT happen? Drives both the nonce-release
+    // decision (release only when safe) and the audit/response wording.
+    let safe_to_release = err.side_effect_definitely_did_not_happen();
+
+    // The terminal label distinguishes a rollback that exhausted its retries
+    // (the pager treats it as needing human attention) from a one-shot execute
+    // failure, and an ambiguous "may have executed" from a clean non-event.
+    let (audit_label, response): (String, Response) = if is_rollback {
+        // /rollback exhausted ROLLBACK_MAX_ATTEMPTS (or hit a non-retryable
+        // error). Emit the distinct terminal ROLLBACK_FAILED state the Python
+        // pager keys on, rather than a generic 502 it has to infer from.
+        warn!(
+            request_id = %payload.request_id,
+            error = %err,
+            safe_to_release,
+            "rollback failed after internal retries; emitting terminal ROLLBACK_FAILED"
+        );
+        let body = ActionFailureResponse {
+            status: ROLLBACK_FAILED_STATUS.to_string(),
+            may_have_executed: !safe_to_release,
+            simulated: payload.simulated,
+        };
+        (
+            format!("ROLLBACK_FAILED_{audit_action}"),
+            (StatusCode::BAD_GATEWAY, Json(body)).into_response(),
+        )
+    } else if safe_to_release {
+        // /execute, pre-send: the isolate provably did not run. A FAILED_ entry
+        // records the non-event; the nonce is released below so a clean retry
+        // can run on fresh state. Plain 502.
+        warn!(
+            request_id = %payload.request_id,
+            error = %err,
+            "EDR execute failed pre-send; auditing non-event and releasing nonce"
+        );
+        (
+            format!("FAILED_{audit_action}"),
+            StatusCode::BAD_GATEWAY.into_response(),
+        )
+    } else {
+        // /execute, ambiguous: the isolate MAY have run. Hold the nonce so a
+        // blind retry cannot double-execute, and surface a distinct
+        // needs-reconciliation state with may_have_executed:true.
+        warn!(
+            request_id = %payload.request_id,
+            error = %err,
+            "EDR execute transport failure is ambiguous; holding nonce and flagging for \
+             human reconciliation (the action MAY have executed)"
+        );
+        let body = ActionFailureResponse {
+            status: NEEDS_RECONCILIATION_STATUS.to_string(),
+            may_have_executed: true,
+            simulated: payload.simulated,
+        };
+        (
+            format!("NEEDS_RECONCILIATION_{audit_action}"),
+            (StatusCode::BAD_GATEWAY, Json(body)).into_response(),
+        )
+    };
+
+    // Audit the outcome BEFORE returning. The step-5 entry recorded the intent;
+    // without this companion the trail would read as if the action happened (or
+    // as if it cleanly failed when it may not have).
+    write_failure_entry(state, payload, &audit_label).await;
+
+    // Release the nonce ONLY when the side effect provably did not happen. This
+    // is the exactly-once invariant: an ambiguous failure keeps the claim so the
+    // next retry is deduped (replayed/InFlight), never re-executed. nonce.rs
+    // warns to release only when sure the side effect did not happen.
+    if safe_to_release {
+        release_nonce(&state.nonces, &payload.request_id).await;
+    } else {
+        warn!(
+            request_id = %payload.request_id,
+            "NOT releasing nonce: the EDR action may have executed; a retry must be deduped, \
+             not re-run, until a human reconciles"
+        );
+    }
+
+    response
+}
+
+/// Append a companion failure/outcome audit entry (best-effort).
+///
+/// The action-type label (`FAILED_...`, `ROLLBACK_FAILED_...`,
+/// `NEEDS_RECONCILIATION_...`) names what happened so the trail never reads as a
+/// clean success. A write error is logged, not swallowed into a fake success and
+/// not propagated, the caller is already returning a failure status.
+async fn write_failure_entry(state: &AppState, payload: &ExecuteRequest, audit_label: &str) {
+    let entry = audit::build_entry(
+        payload.tenant_id.clone(),
+        audit_label.to_string(),
+        payload.host.clone(),
+        payload.approved_by.clone(),
+        payload.simulated,
+    );
+    if let Err(audit_err) =
+        audit::append_audit(&state.audit_log_path, &state.audit_chain, entry).await
+    {
+        warn!(
+            request_id = %payload.request_id,
+            error = %audit_err,
+            audit_label,
+            "failed to write failure audit entry"
+        );
+    }
 }
 
 /// `GET /audit/export?tenant_id=<id>`
@@ -846,6 +1096,27 @@ fn resolve_proxy_secret(is_production: bool) -> String {
     );
 }
 
+/// Decide whether a signing secret is too weak to serve production traffic
+/// (PRX-08).
+///
+/// Returns `Some(message)` when the boot must fail closed: a secret shorter than
+/// `MIN_SECRET_LEN` (256 bits) in production. Returns `None` otherwise, including
+/// for a short secret in dev/CI (the caller warns instead). Extracted as a pure
+/// function so the fail-closed rule is unit-testable without booting the async
+/// runtime, mirroring `resolve_proxy_secret`.
+fn secret_strength_error(secret: &str, is_production: bool) -> Option<String> {
+    if is_production && secret.len() < MIN_SECRET_LEN {
+        return Some(format!(
+            "proxy signing secret is only {} bytes; refusing to start in production with a \
+             signing secret shorter than {MIN_SECRET_LEN} bytes (256 bits), which weakens the \
+             HMAC that authorizes every EDR action. Set a >= {MIN_SECRET_LEN}-byte \
+             VYROX_PROXY_SECRET (PRX-08).",
+            secret.len()
+        ));
+    }
+    None
+}
+
 /// Decide whether this is a production boot.
 ///
 /// The proxy has no first-class environment field, so it reads the same
@@ -989,8 +1260,21 @@ async fn run() {
     // disable auth. (Per-tenant HKDF key derivation is deferred to keep this
     // wave in lockstep with the Python side.)
     let hmac_secret = resolve_proxy_secret(is_production);
-    if hmac_secret.len() < 32 {
-        warn!("proxy signing secret is shorter than 32 bytes; consider rotating to a longer key");
+    // PRX-08: fail closed on a weak signing secret in production (see
+    // `secret_strength_error`). The check is extracted so it is unit-testable
+    // without booting the runtime; here we turn its verdict into a panic (prod)
+    // or a warning (dev/CI).
+    match secret_strength_error(&hmac_secret, is_production) {
+        Some(message) => panic!("{message}"),
+        None if hmac_secret.len() < MIN_SECRET_LEN => {
+            warn!(
+                secret_len = hmac_secret.len(),
+                min_len = MIN_SECRET_LEN,
+                "proxy signing secret is shorter than the minimum; this fails closed in \
+                 production. Rotate to a longer key (PRX-08)."
+            );
+        }
+        None => {}
     }
 
     let audit_log_path = env::var("AUDIT_LOG_PATH").unwrap_or_else(|_| "./audit.jsonl".to_string());
@@ -1935,11 +2219,30 @@ mod tests {
 
     #[tokio::test]
     async fn rollback_against_failing_edr_is_bad_gateway() {
-        // A mock EDR that always 500s on the action endpoint. The proxy must
-        // surface that as 502 BAD_GATEWAY (the Python side maps that to a
-        // paged ROLLBACK_FAILED), never a silent success.
-        let state: MockEdrState = Arc::new(StdMutex::new(HashMap::new()));
+        // A mock EDR that always 500s on the action endpoint. After the bounded
+        // internal retries (RB-01 / T64) the proxy must surface that as 502
+        // BAD_GATEWAY with the distinct terminal ROLLBACK_FAILED body the Python
+        // pager keys on, never a silent success.
+        let (router, _dir) = always_500_crowdstrike_router("rb-fail").await;
+        let creds = json!({
+            "provider": "crowdstrike",
+            "api_key": "id",
+            "api_secret": "secret",
+            "base_url": router.1,
+        });
+        let body = action_body("e2e-rb-fail", Some(creds));
+        let (status, value) = post_json(&router.0, "/rollback", body).await;
+        assert_eq!(status, StatusCode::BAD_GATEWAY);
+        assert_eq!(
+            value["status"], ROLLBACK_FAILED_STATUS,
+            "a rollback that exhausts its retries must report the terminal ROLLBACK_FAILED state"
+        );
+    }
 
+    /// Build a router plus a CrowdStrike-shaped mock whose token endpoint works
+    /// but whose device-action endpoint ALWAYS 500s, for the RB-01 terminal-state
+    /// tests. Returns (router, base_url) and a tempdir kept alive by the caller.
+    async fn always_500_crowdstrike_router(_tag: &str) -> ((Router, String), TempDir) {
         async fn token() -> Json<serde_json::Value> {
             Json(json!({"access_token": "t", "expires_in": 1800}))
         }
@@ -1952,7 +2255,127 @@ mod tests {
                 "/devices/entities/devices-actions/v2",
                 axum::routing::post(always_500),
             )
-            .with_state(state);
+            .with_state(());
+        let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind");
+        let addr: SocketAddr = listener.local_addr().expect("addr");
+        tokio::spawn(async move {
+            axum::serve(listener, app).await.expect("serve");
+        });
+        let (router, dir) = test_router(edr::EdrClient::Noop);
+        ((router, format!("http://{addr}")), dir)
+    }
+
+    // ── T63 / CNT-05: pre-send vs ambiguous transport failure ──────────────
+
+    #[tokio::test]
+    async fn execute_presend_failure_releases_nonce_for_clean_retry() {
+        // A connection refused on /execute is a PRE-SEND failure: the isolate
+        // provably did not run, so the nonce is released and a retry with the
+        // SAME request_id re-dispatches (502 again) rather than being deduped to
+        // 409. If the nonce had been wrongly held, the retry would be 409.
+        let (router, dir) = test_router(edr::EdrClient::Noop);
+        let creds = json!({
+            "provider": "crowdstrike",
+            "api_key": "id",
+            "api_secret": "secret",
+            "base_url": "http://127.0.0.1:1", // nothing listening -> connect refused
+        });
+        let body = action_body("presend-1", Some(creds.clone()));
+        let (s1, _) = post_json(&router, "/execute", body).await;
+        assert_eq!(s1, StatusCode::BAD_GATEWAY, "pre-send failure is 502");
+
+        // Same request_id again: a released nonce means a FRESH claim and another
+        // real attempt (502), NOT a 409 in-flight.
+        let body2 = action_body("presend-1", Some(creds));
+        let (s2, _) = post_json(&router, "/execute", body2).await;
+        assert_eq!(
+            s2,
+            StatusCode::BAD_GATEWAY,
+            "a pre-send failure must release the nonce so the retry re-dispatches, not 409"
+        );
+
+        // The trail records the non-event as FAILED_, never NEEDS_RECONCILIATION_.
+        let log = std::fs::read_to_string(dir.path().join("audit.jsonl")).expect("audit log");
+        assert!(log.contains("\"FAILED_HostIsolation\""));
+        assert!(!log.contains("NEEDS_RECONCILIATION"));
+    }
+
+    #[tokio::test]
+    async fn execute_ambiguous_failure_holds_nonce_and_flags_reconciliation() {
+        // A 5xx from the EDR on /execute is AMBIGUOUS: the EDR was contacted and
+        // may have acted. The proxy must NOT release the nonce (a blind retry
+        // would risk a double-isolate) and must surface the distinct
+        // needs_reconciliation state with may_have_executed:true. A retry with
+        // the same request_id is then deduped to 409 (the claim is still
+        // in-flight), proving the nonce was held.
+        let ((router, base_url), dir) = always_500_crowdstrike_router("amb").await;
+        let creds = json!({
+            "provider": "crowdstrike",
+            "api_key": "id",
+            "api_secret": "secret",
+            "base_url": base_url,
+        });
+        let body = action_body("ambiguous-1", Some(creds.clone()));
+        let (s1, v1) = post_json(&router, "/execute", body).await;
+        assert_eq!(s1, StatusCode::BAD_GATEWAY);
+        assert_eq!(
+            v1["status"], NEEDS_RECONCILIATION_STATUS,
+            "an ambiguous execute failure must flag for human reconciliation"
+        );
+        assert_eq!(
+            v1["may_have_executed"], true,
+            "the response must warn the action may have executed"
+        );
+
+        // Same request_id again: the nonce was held (in-flight), so this dedups
+        // to 409 rather than re-executing.
+        let body2 = action_body("ambiguous-1", Some(creds));
+        let s2 = post(&router, "/execute", body2.clone(), Some(sign_body(&body2))).await;
+        assert_eq!(
+            s2,
+            StatusCode::CONFLICT,
+            "an ambiguous failure must HOLD the nonce: the retry is deduped, not re-run"
+        );
+
+        // The trail records the ambiguity, not a clean failure.
+        let log = std::fs::read_to_string(dir.path().join("audit.jsonl")).expect("audit log");
+        assert!(
+            log.contains("\"NEEDS_RECONCILIATION_HostIsolation\""),
+            "the audit entry must make clear the action may have executed"
+        );
+    }
+
+    // ── T64 / RB-01: bounded rollback retry + terminal ROLLBACK_FAILED ─────
+
+    #[tokio::test]
+    async fn rollback_retries_transient_failure_then_succeeds() {
+        // A mock that 500s on the FIRST device-action hit and 200s afterward.
+        // The bounded internal retry must absorb the transient failure and the
+        // rollback must ultimately succeed (200 rolled_back), with no human page.
+        let attempt = Arc::new(StdMutex::new(0u32));
+
+        async fn token() -> Json<serde_json::Value> {
+            Json(json!({"access_token": "t", "expires_in": 1800}))
+        }
+        async fn flaky(
+            State(attempt): State<Arc<StdMutex<u32>>>,
+        ) -> Result<Json<serde_json::Value>, StatusCode> {
+            let mut n = attempt.lock().unwrap();
+            *n += 1;
+            if *n == 1 {
+                // First attempt: transient server error.
+                Err(StatusCode::INTERNAL_SERVER_ERROR)
+            } else {
+                Ok(Json(json!({ "resources": [], "errors": [] })))
+            }
+        }
+        let app = Router::new()
+            .route("/oauth2/token", axum::routing::post(token))
+            .route(
+                "/devices/entities/devices-actions/v2",
+                axum::routing::post(flaky),
+            )
+            .with_state(attempt.clone());
         let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind");
         let addr: SocketAddr = listener.local_addr().expect("addr");
         tokio::spawn(async move {
@@ -1966,9 +2389,84 @@ mod tests {
             "api_secret": "secret",
             "base_url": format!("http://{addr}"),
         });
-        let body = action_body("e2e-rb-fail", Some(creds));
-        let sig = sign_body(&body);
-        let status = post(&router, "/rollback", body, Some(sig)).await;
+        let body = action_body("rb-retry-ok", Some(creds));
+        let (status, value) = post_json(&router, "/rollback", body).await;
+        assert_eq!(
+            status,
+            StatusCode::OK,
+            "the bounded retry must recover a transient rollback failure"
+        );
+        assert_eq!(value["status"], "rolled_back");
+        assert!(
+            *attempt.lock().unwrap() >= 2,
+            "the rollback must have retried at least once"
+        );
+    }
+
+    #[tokio::test]
+    async fn rollback_exhausts_retries_then_emits_terminal_rollback_failed() {
+        // A device-action that ALWAYS 500s. After ROLLBACK_MAX_ATTEMPTS the proxy
+        // emits the distinct terminal ROLLBACK_FAILED audit entry + body. A 5xx
+        // is AMBIGUOUS about whether the EDR partially applied the lift, so the
+        // nonce is HELD (may_have_executed:true) rather than released: the human
+        // pager reconciles, and a blind retry is deduped to 409 instead of
+        // risking a second uncoordinated lift.
+        let ((router, base_url), dir) = always_500_crowdstrike_router("rb-exhaust").await;
+        let creds = json!({
+            "provider": "crowdstrike",
+            "api_key": "id",
+            "api_secret": "secret",
+            "base_url": base_url,
+        });
+        let body = action_body("rb-exhaust-1", Some(creds.clone()));
+        let (status, value) = post_json(&router, "/rollback", body).await;
         assert_eq!(status, StatusCode::BAD_GATEWAY);
+        assert_eq!(value["status"], ROLLBACK_FAILED_STATUS);
+        assert_eq!(
+            value["may_have_executed"], true,
+            "a server 5xx is ambiguous: the EDR may have partially applied the lift"
+        );
+
+        // The terminal state is in the audit trail under a distinct label.
+        let log = std::fs::read_to_string(dir.path().join("audit.jsonl")).expect("audit log");
+        assert!(
+            log.contains("\"ROLLBACK_FAILED_ROLLBACK_HostIsolation\""),
+            "the trail must carry the distinct terminal ROLLBACK_FAILED label, saw: {log}"
+        );
+
+        // The ambiguous failure HELD the nonce: a retry is deduped to 409, never
+        // a second uncoordinated lift, until a human reconciles.
+        let body2 = action_body("rb-exhaust-1", Some(creds));
+        let s2 = post(&router, "/rollback", body2.clone(), Some(sign_body(&body2))).await;
+        assert_eq!(
+            s2,
+            StatusCode::CONFLICT,
+            "an ambiguous rollback failure holds the nonce: the retry is deduped, not re-run"
+        );
+    }
+
+    // ── PRX-08: weak signing secret fails closed in production ─────────────
+
+    #[test]
+    fn secret_strength_error_fails_closed_only_in_production() {
+        let short = "too-short"; // < 32 bytes
+        let long = "this-secret-is-definitely-at-least-32-bytes-long"; // >= 32
+
+        // Production + short secret -> hard error (the caller panics on this).
+        assert!(
+            secret_strength_error(short, true).is_some(),
+            "a short secret must fail closed in production"
+        );
+        // Production + long secret -> fine.
+        assert!(
+            secret_strength_error(long, true).is_none(),
+            "a >= 32-byte secret is accepted in production"
+        );
+        // Dev/CI + short secret -> no hard error (the caller only warns).
+        assert!(
+            secret_strength_error(short, false).is_none(),
+            "a short secret only warns in dev/CI, never blocks"
+        );
+        assert!(secret_strength_error(long, false).is_none());
     }
 }