From 1e220617563e902d04d9397f51b40153ca735d46 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 15:10:26 +0300 Subject: [PATCH 01/49] refactor(sandbox): extract run_networking from run_sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts TLS state generation, network namespace setup, proxy startup, bypass monitor spawn, and SSH-side proxy URL / netns FD computation out of run_sandbox into a sibling async fn `run_networking` that returns a Networking struct. The identity cache moves with it (only consumed by the proxy). Entrypoint PID allocation moves just above the call site so it can be passed in. No behavior changes — same OCSF emits, same async order, same RAII lifetimes for the proxy and bypass-monitor handles, now held by the returned Networking value in run_sandbox's frame. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 531 +++++++++++++++------------- 1 file changed, 295 insertions(+), 236 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 4a0e61e57..64f9448a9 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -275,6 +275,274 @@ fn is_managed_child(pid: i32) -> bool { .is_ok_and(|children| children.contains(&pid)) } +/// Handles and values produced by [`run_networking`] that the rest of +/// `run_sandbox` consumes. +/// +/// The two `_proxy` / `_bypass_monitor` fields are RAII handles whose drop +/// tears down the proxy and bypass-monitor tasks. They must remain alive for +/// the duration of the sandbox wait loop, which is achieved by holding the +/// returned `Networking` value in `run_sandbox`'s frame. 
+struct Networking { + #[allow(dead_code, reason = "RAII handle: drop tears down the proxy task")] + _proxy: Option, + #[cfg(target_os = "linux")] + #[allow(dead_code, reason = "RAII handle: drop joins the bypass monitor task")] + _bypass_monitor: Option>, + + #[cfg(target_os = "linux")] + netns: Option, + ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, + ssh_proxy_url: Option, + ssh_netns_fd: Option, + denial_rx: Option>, +} + +/// Set up the networking stack: ephemeral CA + TLS state, network namespace, +/// proxy server, bypass monitor, and the SSH-side proxy URL / netns FD. +#[allow(clippy::too_many_arguments)] +async fn run_networking( + policy: &SandboxPolicy, + opa_engine: Option<&Arc>, + entrypoint_pid: Arc, + provider_credentials: &provider_credentials::ProviderCredentialState, + policy_local_ctx: &Arc, + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> Result { + // Identity cache for SHA256 TOFU when OPA is active. Only consumed by + // the proxy, so it's owned here. + let identity_cache = opa_engine.map(|_| Arc::new(BinaryIdentityCache::new())); + + // Generate ephemeral CA and TLS state for HTTPS L7 inspection. + // The CA cert is written to disk so sandbox processes can trust it. + let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { + match SandboxCa::generate() { + Ok(ca) => { + let tls_dir = std::path::Path::new("/etc/openshell-tls"); + let system_ca_bundle = read_system_ca_bundle(); + match write_ca_files(&ca, tls_dir, &system_ca_bundle) { + Ok(paths) => { + // /etc/openshell-tls is subsumed by the /etc baseline + // path injected by enrich_*_baseline_paths(), so no + // explicit Landlock entry is needed here. 
+ + let upstream_config = build_upstream_client_config(&system_ca_bundle); + let cert_cache = CertCache::new(ca); + let state = Arc::new(ProxyTlsState::new(cert_cache, upstream_config)); + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "enabled") + .message("TLS termination enabled: ephemeral CA generated") + .build() + ); + (Some(state), Some(paths)) + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .message(format!( + "Failed to write CA files, TLS termination disabled: {e}" + )) + .build() + ); + (None, None) + } + } + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .message(format!( + "Failed to generate ephemeral CA, TLS termination disabled: {e}" + )) + .build() + ); + (None, None) + } + } + } else { + (None, None) + }; + + // Create network namespace for proxy mode (Linux only). + // This must be created before the proxy AND SSH server so that SSH + // sessions can enter the namespace for network isolation. + #[cfg(target_os = "linux")] + let netns = if matches!(policy.network.mode, NetworkMode::Proxy) { + match NetworkNamespace::create() { + Ok(ns) => { + // Install bypass detection rules (nftables log + reject). + // This provides fast-fail UX and diagnostic logging for direct + // connection attempts that bypass the HTTP CONNECT proxy. 
+ let proxy_port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + if let Err(e) = ns.install_bypass_rules(proxy_port) { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "degraded") + .message(format!( + "Failed to install bypass detection rules (non-fatal): {e}" + )) + .build() + ); + } + Some(ns) + } + Err(e) => { + return Err(miette::miette!( + "Network namespace creation failed and proxy mode requires isolation. \ + Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ + Error: {e}" + )); + } + } + } else { + None + }; + + let (proxy_handle, denial_rx, bypass_denial_tx) = + if matches!(policy.network.mode, NetworkMode::Proxy) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!( + "Network mode is set to proxy but no proxy configuration was provided" + ) + })?; + + let engine = opa_engine.cloned().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; + + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!( + "Proxy mode requires an identity cache (OPA engine must be configured)" + ) + })?; + + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. + #[cfg(target_os = "linux")] + let bind_addr = netns.as_ref().map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); + + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; + + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = + build_inference_context(sandbox_id, openshell_endpoint, inference_routes).await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). 
+ // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) + } else { + (None, None, None) + }; + + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + ) + .await?; + (Some(proxy_handle), denial_rx, bypass_denial_tx) + } else { + (None, None, None) + }; + + // Spawn bypass detection monitor (Linux only, proxy mode only). + // Reads /dev/kmsg for nftables log entries and emits structured + // tracing events for direct connection attempts that bypass the proxy. + #[cfg(target_os = "linux")] + let bypass_monitor_handle = netns.as_ref().and_then(|ns| { + bypass_monitor::spawn( + ns.name().to_string(), + entrypoint_pid.clone(), + bypass_denial_tx, + ) + }); + + // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). + #[cfg(not(target_os = "linux"))] + drop(bypass_denial_tx); + + // Compute the proxy URL and netns fd for SSH sessions. 
+ // SSH shell processes need both to enforce network policy: + // - netns_fd: enter the network namespace via setns() so all traffic + // goes through the veth pair (hard enforcement, non-bypassable) + // - proxy_url: set proxy env vars so cooperative tools route through the + // CONNECT proxy; this also opts Node.js into honoring those vars + #[cfg(target_os = "linux")] + let ssh_netns_fd = netns.as_ref().and_then(NetworkNamespace::ns_fd); + + #[cfg(not(target_os = "linux"))] + let ssh_netns_fd: Option = None; + + let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { + #[cfg(target_os = "linux")] + { + netns.as_ref().map(|ns| { + let port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + format!("http://{}:{port}", ns.host_ip()) + }) + } + #[cfg(not(target_os = "linux"))] + { + policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map(|addr| format!("http://{addr}")) + } + } else { + None + }; + + Ok(Networking { + _proxy: proxy_handle, + #[cfg(target_os = "linux")] + _bypass_monitor: bypass_monitor_handle, + #[cfg(target_os = "linux")] + netns, + ca_file_paths, + ssh_proxy_url, + ssh_netns_fd, + denial_rx, + }) +} + /// Run a command in the sandbox. /// /// # Errors @@ -406,11 +674,6 @@ pub async fn run_sandbox( ); let provider_env = provider_credentials.snapshot().child_env.clone(); - // Create identity cache for SHA256 TOFU when OPA is active - let identity_cache = opa_engine - .as_ref() - .map(|_| Arc::new(BinaryIdentityCache::new())); - // Prepare filesystem: create and chown read_write directories prepare_filesystem(&policy)?; @@ -467,236 +730,27 @@ pub async fn run_sandbox( debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); } - // Generate ephemeral CA and TLS state for HTTPS L7 inspection. - // The CA cert is written to disk so sandbox processes can trust it. 
- let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { - match SandboxCa::generate() { - Ok(ca) => { - let tls_dir = std::path::Path::new("/etc/openshell-tls"); - let system_ca_bundle = read_system_ca_bundle(); - match write_ca_files(&ca, tls_dir, &system_ca_bundle) { - Ok(paths) => { - // /etc/openshell-tls is subsumed by the /etc baseline - // path injected by enrich_*_baseline_paths(), so no - // explicit Landlock entry is needed here. - - let upstream_config = build_upstream_client_config(&system_ca_bundle); - let cert_cache = CertCache::new(ca); - let state = Arc::new(ProxyTlsState::new(cert_cache, upstream_config)); - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "enabled") - .message("TLS termination enabled: ephemeral CA generated") - .build() - ); - (Some(state), Some(paths)) - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .message(format!( - "Failed to write CA files, TLS termination disabled: {e}" - )) - .build() - ); - (None, None) - } - } - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .message(format!( - "Failed to generate ephemeral CA, TLS termination disabled: {e}" - )) - .build() - ); - (None, None) - } - } - } else { - (None, None) - }; - - // Create network namespace for proxy mode (Linux only) - // This must be created before the proxy AND SSH server so that SSH - // sessions can enter the namespace for network isolation. - #[cfg(target_os = "linux")] - let netns = if matches!(policy.network.mode, NetworkMode::Proxy) { - match NetworkNamespace::create() { - Ok(ns) => { - // Install bypass detection rules (nftables log + reject). 
- // This provides fast-fail UX and diagnostic logging for direct - // connection attempts that bypass the HTTP CONNECT proxy. - let proxy_port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - if let Err(e) = ns.install_bypass_rules(proxy_port) { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "degraded") - .message(format!( - "Failed to install bypass detection rules (non-fatal): {e}" - )) - .build() - ); - } - Some(ns) - } - Err(e) => { - return Err(miette::miette!( - "Network namespace creation failed and proxy mode requires isolation. \ - Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ - Error: {e}" - )); - } - } - } else { - None - }; + // Shared PID: set after process spawn so the proxy can look up + // the entrypoint process's /proc/net/tcp for identity binding. + let entrypoint_pid = Arc::new(AtomicU32::new(0)); - // On non-Linux, network namespace isolation is not supported - #[cfg(not(target_os = "linux"))] - #[allow(clippy::no_effect_underscore_binding)] - let _netns: Option<()> = None; + let mut networking = run_networking( + &policy, + opa_engine.as_ref(), + entrypoint_pid.clone(), + &provider_credentials, + &policy_local_ctx, + sandbox_id.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?; // Install the supervisor seccomp prelude after privileged startup helpers // (network namespace setup, nftables probes) complete, but before the SSH // listener and workload process are exposed. apply_supervisor_startup_hardening()?; - // Shared PID: set after process spawn so the proxy can look up - // the entrypoint process's /proc/net/tcp for identity binding. 
- let entrypoint_pid = Arc::new(AtomicU32::new(0)); - - let (_proxy, denial_rx, bypass_denial_tx) = if matches!(policy.network.mode, NetworkMode::Proxy) - { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!("Network mode is set to proxy but no proxy configuration was provided") - })?; - - let engine = opa_engine.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - })?; - - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an identity cache (OPA engine must be configured)") - })?; - - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { - let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); - - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; - - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = build_inference_context( - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; - - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. 
- let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; - - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - ) - .await?; - (Some(proxy_handle), denial_rx, bypass_denial_tx) - } else { - (None, None, None) - }; - - // Spawn bypass detection monitor (Linux only, proxy mode only). - // Reads /dev/kmsg for nftables log entries and emits structured - // tracing events for direct connection attempts that bypass the proxy. - #[cfg(target_os = "linux")] - let _bypass_monitor = netns.as_ref().and_then(|ns| { - bypass_monitor::spawn( - ns.name().to_string(), - entrypoint_pid.clone(), - bypass_denial_tx, - ) - }); - - // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). - #[cfg(not(target_os = "linux"))] - drop(bypass_denial_tx); - - // Compute the proxy URL and netns fd for SSH sessions. 
- // SSH shell processes need both to enforce network policy: - // - netns_fd: enter the network namespace via setns() so all traffic - // goes through the veth pair (hard enforcement, non-bypassable) - // - proxy_url: set proxy env vars so cooperative tools route through the - // CONNECT proxy; this also opts Node.js into honoring those vars - #[cfg(target_os = "linux")] - let ssh_netns_fd = netns.as_ref().and_then(NetworkNamespace::ns_fd); - - #[cfg(not(target_os = "linux"))] - let ssh_netns_fd: Option = None; - - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { - #[cfg(target_os = "linux")] - { - netns.as_ref().map(|ns| { - let port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) - } - #[cfg(not(target_os = "linux"))] - { - policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map(|addr| format!("http://{addr}")) - } - } else { - None - }; - // Zombie reaper — openshell-sandbox may run as PID 1 in containers and // must reap orphaned grandchildren (e.g. background daemons started by // coding agents) to prevent zombie accumulation. 
@@ -768,9 +822,9 @@ pub async fn run_sandbox( if let Some(listen_path) = ssh_socket_path.clone() { let policy_clone = policy.clone(); let workdir_clone = workdir.clone(); - let proxy_url = ssh_proxy_url; - let netns_fd = ssh_netns_fd; - let ca_paths = ca_file_paths.clone(); + let proxy_url = networking.ssh_proxy_url.take(); + let netns_fd = networking.ssh_netns_fd; + let ca_paths = networking.ca_file_paths.clone(); let provider_credentials_clone = provider_credentials.clone(); let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); @@ -837,7 +891,12 @@ pub async fn run_sandbox( sandbox_id.as_ref(), ssh_socket_path.as_ref(), ) { - supervisor_session::spawn(endpoint.clone(), id.clone(), socket.clone(), ssh_netns_fd); + supervisor_session::spawn( + endpoint.clone(), + id.clone(), + socket.clone(), + networking.ssh_netns_fd, + ); info!("supervisor session task spawned"); } @@ -848,8 +907,8 @@ pub async fn run_sandbox( workdir.as_deref(), interactive, &policy, - netns.as_ref(), - ca_file_paths.as_ref(), + networking.netns.as_ref(), + networking.ca_file_paths.as_ref(), &provider_env, )?; @@ -860,7 +919,7 @@ pub async fn run_sandbox( workdir.as_deref(), interactive, &policy, - ca_file_paths.as_ref(), + networking.ca_file_paths.as_ref(), &provider_env, )?; @@ -979,7 +1038,7 @@ pub async fn run_sandbox( }); // Spawn denial aggregator (gRPC mode only, when proxy is active). - if let Some(rx) = denial_rx { + if let Some(rx) = networking.denial_rx.take() { // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID. 
let agg_name = sandbox_name_for_agg.clone().unwrap_or_else(|| id.clone()); let agg_endpoint = endpoint.clone(); From 26e500cf28783dbdc307248ddda8150284ba45e8 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 16:20:06 +0300 Subject: [PATCH 02/49] refactor(sandbox): extract run_process and lift netns to run_sandbox Lifts the post-networking tail of `run_sandbox` (zombie reaper, SSH server, supervisor session, process spawn, OPA probe, policy poll loop, denial aggregator, wait/exit) into a sibling async fn `run_process`. Also moves network namespace creation out of `run_networking` into a new `create_netns_for_proxy` helper invoked from `run_sandbox`, so `run_networking` is purely the proxy component (OPA evaluation, TLS interception, credential injection, inference routing, gRPC control API). The netns is then borrowed into both `run_networking` and `run_process`. No behavior change. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 634 ++++++++++++++++------------ 1 file changed, 353 insertions(+), 281 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 64f9448a9..8bcc1d275 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -282,6 +282,50 @@ fn is_managed_child(pid: i32) -> bool { /// tears down the proxy and bypass-monitor tasks. They must remain alive for /// the duration of the sandbox wait loop, which is achieved by holding the /// returned `Networking` value in `run_sandbox`'s frame. +/// Create the workload's network namespace and install bypass detection +/// rules. Returns `None` when the policy is not in proxy mode. Linux-only. +/// +/// The namespace is shared infrastructure: the proxy binds to its host-side +/// veth IP and reads /dev/kmsg from inside it for bypass detection, while +/// the workload child and SSH sessions enter it via `setns()`. 
+#[cfg(target_os = "linux")] +fn create_netns_for_proxy(policy: &SandboxPolicy) -> Result> { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return Ok(None); + } + match NetworkNamespace::create() { + Ok(ns) => { + // Install bypass detection rules (nftables log + reject). + // This provides fast-fail UX and diagnostic logging for direct + // connection attempts that bypass the HTTP CONNECT proxy. + let proxy_port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + if let Err(e) = ns.install_bypass_rules(proxy_port) { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "degraded") + .message(format!( + "Failed to install bypass detection rules (non-fatal): {e}" + )) + .build() + ); + } + Ok(Some(ns)) + } + Err(e) => Err(miette::miette!( + "Network namespace creation failed and proxy mode requires isolation. \ + Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ + Error: {e}" + )), + } +} + struct Networking { #[allow(dead_code, reason = "RAII handle: drop tears down the proxy task")] _proxy: Option, @@ -289,19 +333,22 @@ struct Networking { #[allow(dead_code, reason = "RAII handle: drop joins the bypass monitor task")] _bypass_monitor: Option>, - #[cfg(target_os = "linux")] - netns: Option, ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, ssh_proxy_url: Option, ssh_netns_fd: Option, denial_rx: Option>, } -/// Set up the networking stack: ephemeral CA + TLS state, network namespace, -/// proxy server, bypass monitor, and the SSH-side proxy URL / netns FD. +/// Set up the networking stack: ephemeral CA + TLS state, proxy server, +/// bypass monitor, and the SSH-side proxy URL / netns FD. 
+/// +/// The network namespace is created by `run_sandbox` and borrowed in here — +/// it is shared infrastructure used by both the proxy (bind address, bypass +/// monitor) and the workload child (entered via `setns()` in `pre_exec`). #[allow(clippy::too_many_arguments)] async fn run_networking( policy: &SandboxPolicy, + #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, opa_engine: Option<&Arc>, entrypoint_pid: Arc, provider_credentials: &provider_credentials::ProviderCredentialState, @@ -373,48 +420,6 @@ async fn run_networking( (None, None) }; - // Create network namespace for proxy mode (Linux only). - // This must be created before the proxy AND SSH server so that SSH - // sessions can enter the namespace for network isolation. - #[cfg(target_os = "linux")] - let netns = if matches!(policy.network.mode, NetworkMode::Proxy) { - match NetworkNamespace::create() { - Ok(ns) => { - // Install bypass detection rules (nftables log + reject). - // This provides fast-fail UX and diagnostic logging for direct - // connection attempts that bypass the HTTP CONNECT proxy. - let proxy_port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - if let Err(e) = ns.install_bypass_rules(proxy_port) { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "degraded") - .message(format!( - "Failed to install bypass detection rules (non-fatal): {e}" - )) - .build() - ); - } - Some(ns) - } - Err(e) => { - return Err(miette::miette!( - "Network namespace creation failed and proxy mode requires isolation. \ - Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. 
\ - Error: {e}" - )); - } - } - } else { - None - }; - let (proxy_handle, denial_rx, bypass_denial_tx) = if matches!(policy.network.mode, NetworkMode::Proxy) { let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { @@ -436,7 +441,7 @@ async fn run_networking( // If we have a network namespace, bind to the veth host IP so sandboxed // processes can reach the proxy via TCP. #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { + let bind_addr = netns.map(|ns| { let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); SocketAddr::new(ns.host_ip(), port) }); @@ -480,7 +485,7 @@ async fn run_networking( // Reads /dev/kmsg for nftables log entries and emits structured // tracing events for direct connection attempts that bypass the proxy. #[cfg(target_os = "linux")] - let bypass_monitor_handle = netns.as_ref().and_then(|ns| { + let bypass_monitor_handle = netns.and_then(|ns| { bypass_monitor::spawn( ns.name().to_string(), entrypoint_pid.clone(), @@ -499,7 +504,7 @@ async fn run_networking( // - proxy_url: set proxy env vars so cooperative tools route through the // CONNECT proxy; this also opts Node.js into honoring those vars #[cfg(target_os = "linux")] - let ssh_netns_fd = netns.as_ref().and_then(NetworkNamespace::ns_fd); + let ssh_netns_fd = netns.and_then(NetworkNamespace::ns_fd); #[cfg(not(target_os = "linux"))] let ssh_netns_fd: Option = None; @@ -507,7 +512,7 @@ async fn run_networking( let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { #[cfg(target_os = "linux")] { - netns.as_ref().map(|ns| { + netns.map(|ns| { let port = policy .network .proxy @@ -534,8 +539,6 @@ async fn run_networking( _proxy: proxy_handle, #[cfg(target_os = "linux")] _bypass_monitor: bypass_monitor_handle, - #[cfg(target_os = "linux")] - netns, ca_file_paths, ssh_proxy_url, ssh_netns_fd, @@ -543,214 +546,39 @@ async fn run_networking( }) } -/// Run a command in the sandbox. 
+/// Run the sandbox workload: spawn the zombie reaper, the SSH server, the +/// supervisor session, the entrypoint process, the OPA symlink probe, and the +/// policy poll loop / denial aggregator; then wait for the entrypoint with an +/// optional timeout and emit the exit OCSF event. /// -/// # Errors -/// -/// Returns an error if the command fails to start or encounters a fatal error. +/// Networking outputs (`ssh_proxy_url`, `ssh_netns_fd`, `ca_file_paths`, +/// `netns`, `denial_rx`) are passed in individually so that this fn does not +/// depend on the `Networking` struct directly. #[allow(clippy::too_many_arguments, clippy::similar_names)] -pub async fn run_sandbox( - command: Vec, - workdir: Option, +async fn run_process( + program: &str, + args: &[String], + workdir: Option<&str>, timeout_secs: u64, interactive: bool, - sandbox_id: Option, - sandbox: Option, - openshell_endpoint: Option, - policy_rules: Option, - policy_data: Option, + sandbox_id: Option<&str>, + sandbox_name_for_agg: Option<&str>, + openshell_endpoint: Option<&str>, ssh_socket_path: Option, - _health_check: bool, - _health_port: u16, - inference_routes: Option, + policy: &SandboxPolicy, + opa_engine: Option<&Arc>, + retained_proto: Option<&openshell_core::proto::SandboxPolicy>, + entrypoint_pid: Arc, + provider_credentials: provider_credentials::ProviderCredentialState, + provider_env: std::collections::HashMap, + policy_local_ctx: Arc, ocsf_enabled: Arc, + ssh_proxy_url: Option, + ssh_netns_fd: Option, + ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, + #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, + denial_rx: Option>, ) -> Result { - let (program, args) = command - .split_first() - .ok_or_else(|| miette::miette!("No command specified"))?; - - // Initialize the process-wide OCSF context early so that events emitted - // during policy loading (filesystem config, validation) have a context. 
- // Proxy IP/port use defaults here; they are only significant for network - // events which happen after the netns is created. - { - let hostname = std::fs::read_to_string("/etc/hostname").map_or_else( - |_| "openshell-sandbox".to_string(), - |s| s.trim().to_string(), - ); - - if OCSF_CTX - .set(SandboxContext { - sandbox_id: sandbox_id.clone().unwrap_or_default(), - sandbox_name: sandbox.as_deref().unwrap_or_default().to_string(), - container_image: std::env::var("OPENSHELL_CONTAINER_IMAGE").unwrap_or_default(), - hostname, - product_version: openshell_core::VERSION.to_string(), - proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), - proxy_port: 3128, - }) - .is_err() - { - debug!("OCSF context already initialized, keeping existing"); - } - } - - // Load policy and initialize OPA engine - let openshell_endpoint_for_proxy = openshell_endpoint.clone(); - let sandbox_name_for_agg = sandbox.clone(); - let (policy, opa_engine, retained_proto) = load_policy( - sandbox_id.clone(), - sandbox, - openshell_endpoint.clone(), - policy_rules, - policy_data, - ) - .await?; - let policy_local_ctx = Arc::new(policy_local::PolicyLocalContext::new( - retained_proto.clone(), - openshell_endpoint.clone(), - sandbox_name_for_agg.clone().or_else(|| sandbox_id.clone()), - )); - - // Validate that the required "sandbox" user exists in this image. - // All sandbox images must include this user for privilege dropping. - #[cfg(unix)] - validate_sandbox_user(&policy)?; - - // Fetch provider environment variables from the server. - // This is done after loading the policy so the sandbox can still start - // even if provider env fetch fails (graceful degradation). 
- let (provider_env_revision, provider_env, provider_credential_expires_at_ms) = - if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) { - match grpc_client::fetch_provider_environment(endpoint, id).await { - Ok(result) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "loaded") - .message(format!( - "Fetched provider environment [env_count:{}]", - result.environment.len() - )) - .build() - ); - ( - result.provider_env_revision, - result.environment, - result.credential_expires_at_ms, - ) - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Other, "degraded") - .message(format!( - "Failed to fetch provider environment, continuing without: {e}" - )) - .build() - ); - ( - 0, - std::collections::HashMap::new(), - std::collections::HashMap::new(), - ) - } - } - } else { - ( - 0, - std::collections::HashMap::new(), - std::collections::HashMap::new(), - ) - }; - - let provider_credentials = provider_credentials::ProviderCredentialState::from_environment( - provider_env_revision, - provider_env, - provider_credential_expires_at_ms, - ); - let provider_env = provider_credentials.snapshot().child_env.clone(); - - // Prepare filesystem: create and chown read_write directories - prepare_filesystem(&policy)?; - - #[cfg(target_os = "linux")] - { - let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { - process::RuntimePidLimitMode::Require - } else { - process::RuntimePidLimitMode::Warn - }; - process::check_runtime_pid_limit(pid_limit_mode)?; - } - - // Initialize the agent-proposals feature flag. Default false until the - // initial settings fetch (or the poll loop) tells us otherwise. 
The flag - // gates the skill install, the policy.local route handler, and the L7 - // deny body's `next_steps` field — see `agent_proposals_enabled()`. - let proposals_enabled = Arc::new(std::sync::atomic::AtomicBool::new(false)); - if AGENT_PROPOSALS_ENABLED - .set(proposals_enabled.clone()) - .is_err() - { - debug!("agent proposals flag already initialized, keeping existing"); - } - - // Eagerly fetch the initial settings so skill install can honor the flag - // at startup rather than waiting for the poll loop's first tick. In - // offline/file-mode there is no gateway, so the flag stays false. - if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) - && let Ok(client) = grpc_client::CachedOpenShellClient::connect(endpoint).await - && let Ok(result) = client.poll_settings(id).await - { - let initial = extract_bool_setting( - &result.settings, - openshell_core::settings::AGENT_POLICY_PROPOSALS_ENABLED_KEY, - ) - .unwrap_or(false); - proposals_enabled.store(initial, Ordering::Relaxed); - } - - if agent_proposals_enabled() { - match skills::install_static_skills() { - Ok(installed) => { - info!( - path = %installed.policy_advisor.display(), - "Installed sandbox agent skill" - ); - } - Err(error) => { - warn!(error = %error, "Failed to install sandbox agent skill"); - } - } - } else { - debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); - } - - // Shared PID: set after process spawn so the proxy can look up - // the entrypoint process's /proc/net/tcp for identity binding. 
- let entrypoint_pid = Arc::new(AtomicU32::new(0)); - - let mut networking = run_networking( - &policy, - opa_engine.as_ref(), - entrypoint_pid.clone(), - &provider_credentials, - &policy_local_ctx, - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; - - // Install the supervisor seccomp prelude after privileged startup helpers - // (network namespace setup, nftables probes) complete, but before the SSH - // listener and workload process are exposed. - apply_supervisor_startup_hardening()?; - // Zombie reaper — openshell-sandbox may run as PID 1 in containers and // must reap orphaned grandchildren (e.g. background daemons started by // coding agents) to prevent zombie accumulation. @@ -821,10 +649,10 @@ pub async fn run_sandbox( let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); if let Some(listen_path) = ssh_socket_path.clone() { let policy_clone = policy.clone(); - let workdir_clone = workdir.clone(); - let proxy_url = networking.ssh_proxy_url.take(); - let netns_fd = networking.ssh_netns_fd; - let ca_paths = networking.ca_file_paths.clone(); + let workdir_clone = workdir.map(str::to_string); + let proxy_url = ssh_proxy_url; + let netns_fd = ssh_netns_fd; + let ca_paths = ca_file_paths.clone(); let provider_credentials_clone = provider_credentials.clone(); let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); @@ -886,16 +714,14 @@ pub async fn run_sandbox( // Spawn the persistent supervisor session if we have a gateway endpoint // and sandbox identity. The session provides relay channels for SSH // connect and ExecSandbox through the gateway. 
- if let (Some(endpoint), Some(id), Some(socket)) = ( - openshell_endpoint.as_ref(), - sandbox_id.as_ref(), - ssh_socket_path.as_ref(), - ) { + if let (Some(endpoint), Some(id), Some(socket)) = + (openshell_endpoint, sandbox_id, ssh_socket_path.as_ref()) + { supervisor_session::spawn( - endpoint.clone(), - id.clone(), + endpoint.to_string(), + id.to_string(), socket.clone(), - networking.ssh_netns_fd, + ssh_netns_fd, ); info!("supervisor session task spawned"); } @@ -904,11 +730,11 @@ pub async fn run_sandbox( let mut handle = ProcessHandle::spawn( program, args, - workdir.as_deref(), + workdir, interactive, - &policy, - networking.netns.as_ref(), - networking.ca_file_paths.as_ref(), + policy, + netns, + ca_file_paths.as_ref(), &provider_env, )?; @@ -916,10 +742,10 @@ pub async fn run_sandbox( let mut handle = ProcessHandle::spawn( program, args, - workdir.as_deref(), + workdir, interactive, - &policy, - networking.ca_file_paths.as_ref(), + policy, + ca_file_paths.as_ref(), &provider_env, )?; @@ -947,7 +773,7 @@ pub async fn run_sandbox( // just been spawned and its mount namespace / procfs entries may not // be fully populated yet. Instead, we probe with retries until // /proc//root/ is accessible or we exhaust attempts. - if let (Some(engine), Some(proto)) = (&opa_engine, &retained_proto) { + if let (Some(engine), Some(proto)) = (opa_engine, retained_proto) { let resolve_engine = engine.clone(); let resolve_proto = proto.clone(); let resolve_pid = entrypoint_pid.clone(); @@ -999,11 +825,9 @@ pub async fn run_sandbox( } // Spawn background policy poll task (gRPC mode only). 
- if let (Some(id), Some(endpoint), Some(engine)) = - (&sandbox_id, &openshell_endpoint, &opa_engine) - { - let poll_id = id.clone(); - let poll_endpoint = endpoint.clone(); + if let (Some(id), Some(endpoint), Some(engine)) = (sandbox_id, openshell_endpoint, opa_engine) { + let poll_id = id.to_string(); + let poll_endpoint = endpoint.to_string(); let poll_engine = engine.clone(); let poll_ocsf_enabled = ocsf_enabled.clone(); let poll_pid = entrypoint_pid.clone(); @@ -1038,10 +862,10 @@ pub async fn run_sandbox( }); // Spawn denial aggregator (gRPC mode only, when proxy is active). - if let Some(rx) = networking.denial_rx.take() { + if let Some(rx) = denial_rx { // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID. - let agg_name = sandbox_name_for_agg.clone().unwrap_or_else(|| id.clone()); - let agg_endpoint = endpoint.clone(); + let agg_name = sandbox_name_for_agg.map_or_else(|| id.to_string(), str::to_string); + let agg_endpoint = endpoint.to_string(); let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") .ok() .and_then(|v| v.parse().ok()) @@ -1107,6 +931,254 @@ pub async fn run_sandbox( Ok(status.code()) } +/// Run a command in the sandbox. +/// +/// # Errors +/// +/// Returns an error if the command fails to start or encounters a fatal error. +#[allow(clippy::too_many_arguments, clippy::similar_names)] +pub async fn run_sandbox( + command: Vec, + workdir: Option, + timeout_secs: u64, + interactive: bool, + sandbox_id: Option, + sandbox: Option, + openshell_endpoint: Option, + policy_rules: Option, + policy_data: Option, + ssh_socket_path: Option, + _health_check: bool, + _health_port: u16, + inference_routes: Option, + ocsf_enabled: Arc, +) -> Result { + let (program, args) = command + .split_first() + .ok_or_else(|| miette::miette!("No command specified"))?; + + // Initialize the process-wide OCSF context early so that events emitted + // during policy loading (filesystem config, validation) have a context. 
+ // Proxy IP/port use defaults here; they are only significant for network + // events which happen after the netns is created. + { + let hostname = std::fs::read_to_string("/etc/hostname").map_or_else( + |_| "openshell-sandbox".to_string(), + |s| s.trim().to_string(), + ); + + if OCSF_CTX + .set(SandboxContext { + sandbox_id: sandbox_id.clone().unwrap_or_default(), + sandbox_name: sandbox.as_deref().unwrap_or_default().to_string(), + container_image: std::env::var("OPENSHELL_CONTAINER_IMAGE").unwrap_or_default(), + hostname, + product_version: openshell_core::VERSION.to_string(), + proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), + proxy_port: 3128, + }) + .is_err() + { + debug!("OCSF context already initialized, keeping existing"); + } + } + + // Load policy and initialize OPA engine + let openshell_endpoint_for_proxy = openshell_endpoint.clone(); + let sandbox_name_for_agg = sandbox.clone(); + let (policy, opa_engine, retained_proto) = load_policy( + sandbox_id.clone(), + sandbox, + openshell_endpoint.clone(), + policy_rules, + policy_data, + ) + .await?; + let policy_local_ctx = Arc::new(policy_local::PolicyLocalContext::new( + retained_proto.clone(), + openshell_endpoint.clone(), + sandbox_name_for_agg.clone().or_else(|| sandbox_id.clone()), + )); + + // Validate that the required "sandbox" user exists in this image. + // All sandbox images must include this user for privilege dropping. + #[cfg(unix)] + validate_sandbox_user(&policy)?; + + // Fetch provider environment variables from the server. + // This is done after loading the policy so the sandbox can still start + // even if provider env fetch fails (graceful degradation). 
+ let (provider_env_revision, provider_env, provider_credential_expires_at_ms) = + if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) { + match grpc_client::fetch_provider_environment(endpoint, id).await { + Ok(result) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "loaded") + .message(format!( + "Fetched provider environment [env_count:{}]", + result.environment.len() + )) + .build() + ); + ( + result.provider_env_revision, + result.environment, + result.credential_expires_at_ms, + ) + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Other, "degraded") + .message(format!( + "Failed to fetch provider environment, continuing without: {e}" + )) + .build() + ); + ( + 0, + std::collections::HashMap::new(), + std::collections::HashMap::new(), + ) + } + } + } else { + ( + 0, + std::collections::HashMap::new(), + std::collections::HashMap::new(), + ) + }; + + let provider_credentials = provider_credentials::ProviderCredentialState::from_environment( + provider_env_revision, + provider_env, + provider_credential_expires_at_ms, + ); + let provider_env = provider_credentials.snapshot().child_env.clone(); + + // Prepare filesystem: create and chown read_write directories + prepare_filesystem(&policy)?; + + #[cfg(target_os = "linux")] + { + let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { + process::RuntimePidLimitMode::Require + } else { + process::RuntimePidLimitMode::Warn + }; + process::check_runtime_pid_limit(pid_limit_mode)?; + } + + // Initialize the agent-proposals feature flag. Default false until the + // initial settings fetch (or the poll loop) tells us otherwise. 
The flag + // gates the skill install, the policy.local route handler, and the L7 + // deny body's `next_steps` field — see `agent_proposals_enabled()`. + let proposals_enabled = Arc::new(std::sync::atomic::AtomicBool::new(false)); + if AGENT_PROPOSALS_ENABLED + .set(proposals_enabled.clone()) + .is_err() + { + debug!("agent proposals flag already initialized, keeping existing"); + } + + // Eagerly fetch the initial settings so skill install can honor the flag + // at startup rather than waiting for the poll loop's first tick. In + // offline/file-mode there is no gateway, so the flag stays false. + if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) + && let Ok(client) = grpc_client::CachedOpenShellClient::connect(endpoint).await + && let Ok(result) = client.poll_settings(id).await + { + let initial = extract_bool_setting( + &result.settings, + openshell_core::settings::AGENT_POLICY_PROPOSALS_ENABLED_KEY, + ) + .unwrap_or(false); + proposals_enabled.store(initial, Ordering::Relaxed); + } + + if agent_proposals_enabled() { + match skills::install_static_skills() { + Ok(installed) => { + info!( + path = %installed.policy_advisor.display(), + "Installed sandbox agent skill" + ); + } + Err(error) => { + warn!(error = %error, "Failed to install sandbox agent skill"); + } + } + } else { + debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); + } + + // Shared PID: set after process spawn so the proxy can look up + // the entrypoint process's /proc/net/tcp for identity binding. + let entrypoint_pid = Arc::new(AtomicU32::new(0)); + + // Create the workload's network namespace. It is shared infrastructure: + // the proxy binds to its host-side veth IP, the bypass monitor reads + // /dev/kmsg from inside it, and the workload child / SSH sessions enter + // it via setns(). The RAII handle lives in this frame for the duration + // of the sandbox. 
+ #[cfg(target_os = "linux")] + let netns = create_netns_for_proxy(&policy)?; + + let mut networking = run_networking( + &policy, + #[cfg(target_os = "linux")] + netns.as_ref(), + opa_engine.as_ref(), + entrypoint_pid.clone(), + &provider_credentials, + &policy_local_ctx, + sandbox_id.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?; + + // Install the supervisor seccomp prelude after privileged startup helpers + // (network namespace setup, nftables probes) complete, but before the SSH + // listener and workload process are exposed. + apply_supervisor_startup_hardening()?; + + let exit_code = run_process( + program, + args, + workdir.as_deref(), + timeout_secs, + interactive, + sandbox_id.as_deref(), + sandbox_name_for_agg.as_deref(), + openshell_endpoint.as_deref(), + ssh_socket_path, + &policy, + opa_engine.as_ref(), + retained_proto.as_ref(), + entrypoint_pid, + provider_credentials, + provider_env, + policy_local_ctx, + ocsf_enabled, + networking.ssh_proxy_url.take(), + networking.ssh_netns_fd, + networking.ca_file_paths.clone(), + #[cfg(target_os = "linux")] + netns.as_ref(), + networking.denial_rx.take(), + ) + .await?; + + Ok(exit_code) +} + /// Build an inference context for local routing, if route sources are available. /// /// Route sources (in priority order): From 228d5a76fc3a717007517d55208f005247e8ca00 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 17:56:59 +0300 Subject: [PATCH 03/49] chore(workspace): scaffold openshell-supervisor-networking and openshell-supervisor-process crates Add empty placeholder crates that subsequent commits will populate as the sandbox decomposition proceeds. Both crates compile clean as part of the workspace and are picked up automatically by the existing `members = ["crates/*"]` glob. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 8 ++++++++ .../openshell-supervisor-networking/Cargo.toml | 18 ++++++++++++++++++ .../openshell-supervisor-networking/src/lib.rs | 8 ++++++++ crates/openshell-supervisor-process/Cargo.toml | 18 ++++++++++++++++++ crates/openshell-supervisor-process/src/lib.rs | 9 +++++++++ 5 files changed, 61 insertions(+) create mode 100644 crates/openshell-supervisor-networking/Cargo.toml create mode 100644 crates/openshell-supervisor-networking/src/lib.rs create mode 100644 crates/openshell-supervisor-process/Cargo.toml create mode 100644 crates/openshell-supervisor-process/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ad7efabc9..22f02198b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3763,6 +3763,14 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "openshell-supervisor-networking" +version = "0.0.0" + +[[package]] +name = "openshell-supervisor-process" +version = "0.0.0" + [[package]] name = "openshell-tui" version = "0.0.0" diff --git a/crates/openshell-supervisor-networking/Cargo.toml b/crates/openshell-supervisor-networking/Cargo.toml new file mode 100644 index 000000000..8d3e110e7 --- /dev/null +++ b/crates/openshell-supervisor-networking/Cargo.toml @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-supervisor-networking" +description = "Networking component of the OpenShell supervisor: proxy, L7 enforcement, OPA, inference routing, denial aggregator" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true + +[dependencies] + +[dev-dependencies] + +[lints] +workspace = true diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs new file mode 100644 index 000000000..4dcda101c --- /dev/null +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -0,0 +1,8 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Networking component of the OpenShell supervisor. +//! +//! Owns the egress proxy, L7 enforcement, OPA policy engine, identity cache, +//! inference routing, TLS interception, and denial aggregation. Populated by +//! follow-up commits as modules migrate out of `openshell-sandbox`. diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml new file mode 100644 index 000000000..faf379f51 --- /dev/null +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-supervisor-process" +description = "Process component of the OpenShell supervisor: entrypoint spawn, SSH server, supervisor session, netns, bypass monitor" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true + +[dependencies] + +[dev-dependencies] + +[lints] +workspace = true diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs new file mode 100644 index 000000000..4ba7d4871 --- /dev/null +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process component of the OpenShell supervisor. +//! +//! Owns the entrypoint process spawn, SSH server, supervisor session, network +//! namespace, bypass monitor, child environment construction, skills install, +//! and log push. Populated by follow-up commits as modules migrate out of +//! `openshell-sandbox`. From b045ef7137881d28442e81a759814a29a72daf97 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 17:57:14 +0300 Subject: [PATCH 04/49] refactor(core): lift DenialEvent to openshell-core The DenialEvent struct is emitted by both the proxy/L7 layer (networking-side) and the bypass monitor (process-side), and crosses the run_networking -> run_process API boundary. Move it to openshell-core so the eventual supervisor-networking and supervisor-process crates can both reference it without depending on each other. DenialAggregator and the channel/flush helpers stay in openshell-sandbox for now. A thin `pub use openshell_core::DenialEvent;` re-export from denial_aggregator.rs keeps every existing `crate::denial_aggregator::DenialEvent` call site resolving without further edits. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-core/src/denial.rs | 32 +++++++++++++++++++ crates/openshell-core/src/lib.rs | 2 ++ .../src/denial_aggregator.rs | 21 +----------- 3 files changed, 35 insertions(+), 20 deletions(-) create mode 100644 crates/openshell-core/src/denial.rs diff --git a/crates/openshell-core/src/denial.rs b/crates/openshell-core/src/denial.rs new file mode 100644 index 000000000..4f610f6e3 --- /dev/null +++ b/crates/openshell-core/src/denial.rs @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-component denial event type. +//! +//! `DenialEvent` is emitted by the supervisor's networking proxy (on L4/L7 +//! deny) and by the bypass monitor (on direct-connect attempts that bypass +//! the proxy). It is consumed by the networking-side denial aggregator that +//! deduplicates and flushes summaries to the gateway. The type lives in +//! `openshell-core` so that the eventual networking and process supervisor +//! crates can both reference it without depending on each other. + +/// A single denial event emitted by the proxy or the bypass monitor. +#[derive(Debug, Clone)] +pub struct DenialEvent { + /// Destination host that was denied. + pub host: String, + /// Destination port that was denied. + pub port: u16, + /// Binary path that initiated the connection (if resolved). + pub binary: String, + /// Ancestor binary paths from process tree walk. + pub ancestors: Vec, + /// Reason for denial (e.g. "no matching policy", "internal address"). + pub deny_reason: String, + /// Denial stage: "connect", "forward", "ssrf", "l7", "bypass". + pub denial_stage: String, + /// L7 request method (if this is an L7 denial). + pub l7_method: Option, + /// L7 target path. 
+ pub l7_path: Option, +} diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 17548ad1a..52cdba5c4 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -11,6 +11,7 @@ pub mod auth; pub mod config; +pub mod denial; pub mod driver_utils; pub mod error; pub mod forward; @@ -30,6 +31,7 @@ pub use config::{ ComputeDriverKind, Config, GatewayAuthConfig, GatewayJwtConfig, MtlsAuthConfig, OidcConfig, TlsConfig, }; +pub use denial::DenialEvent; pub use error::{ComputeDriverError, Error, Result}; pub use metadata::{GetResourceVersion, ObjectId, ObjectLabels, ObjectName, SetResourceVersion}; diff --git a/crates/openshell-sandbox/src/denial_aggregator.rs b/crates/openshell-sandbox/src/denial_aggregator.rs index 5d41adffd..648095a23 100644 --- a/crates/openshell-sandbox/src/denial_aggregator.rs +++ b/crates/openshell-sandbox/src/denial_aggregator.rs @@ -14,26 +14,7 @@ use std::future::Future; use tokio::sync::mpsc; use tracing::debug; -/// A single denial event emitted by the proxy. -#[derive(Debug, Clone)] -pub struct DenialEvent { - /// Destination host that was denied. - pub host: String, - /// Destination port that was denied. - pub port: u16, - /// Binary path that initiated the connection (if resolved). - pub binary: String, - /// Ancestor binary paths from process tree walk. - pub ancestors: Vec, - /// Reason for denial (e.g. "no matching policy", "internal address"). - pub deny_reason: String, - /// Denial stage: "connect", "forward", "ssrf", "l7", "bypass". - pub denial_stage: String, - /// L7 request details (method, path, decision) if this is an L7 denial. - pub l7_method: Option, - /// L7 target path. - pub l7_path: Option, -} +pub use openshell_core::DenialEvent; /// Aggregated denial summary keyed by `(host, port, binary)`. 
#[derive(Debug, Clone)] From 055054246eef44a33c175c16814d16c01444487a Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 18:30:13 +0300 Subject: [PATCH 05/49] refactor(core): lift normalize_path to openshell-core Move the lexical path-normalization helper from openshell-policy to openshell-core::paths so it can be reached from crates that sit below openshell-policy in the dependency graph. openshell-policy keeps its existing public API via a `pub use` re-export, so all current call sites (e.g. openshell-sandbox/src/policy.rs) continue to resolve unchanged. This is a prerequisite for lifting openshell-sandbox/src/policy.rs into openshell-core: that file's `From` impl calls normalize_path, and lifting it as-is would cycle through openshell-policy. Signed-off-by: Radoslav Hubenov --- crates/openshell-core/src/paths.rs | 46 ++++++++++++++++++++++++++++-- crates/openshell-policy/src/lib.rs | 24 +++------------- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/crates/openshell-core/src/paths.rs b/crates/openshell-core/src/paths.rs index 65000c6cf..9445347c7 100644 --- a/crates/openshell-core/src/paths.rs +++ b/crates/openshell-core/src/paths.rs @@ -1,12 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Centralized XDG config directory resolution and permission helpers. +//! Path utilities: XDG config directory resolution, permission helpers, and +//! lexical path normalization. //! //! All `OpenShell` crates should use [`xdg_config_dir`] from this module instead //! of reimplementing the XDG lookup. The permission helpers ensure that //! sensitive files (private keys, tokens) and the directories containing them -//! are created with restrictive modes. +//! are created with restrictive modes. [`normalize_path`] performs purely +//! lexical normalization (no filesystem access, no symlink resolution). 
use miette::{IntoDiagnostic, Result, WrapErr}; use std::path::{Path, PathBuf}; @@ -126,6 +128,33 @@ pub fn is_file_permissions_too_open(path: &Path) -> bool { std::fs::metadata(path).is_ok_and(|m| m.permissions().mode() & 0o077 != 0) } +/// Normalize a filesystem path by collapsing redundant separators +/// and removing trailing slashes, without requiring the path to exist on disk. +/// +/// This is a lexical normalization only — it does NOT resolve symlinks or +/// check the filesystem. `..` components are preserved verbatim; callers that +/// need to reject parent traversal must validate separately. +pub fn normalize_path(path: &str) -> String { + use std::path::Component; + + let p = Path::new(path); + let mut normalized = PathBuf::new(); + for component in p.components() { + match component { + Component::Prefix(prefix) => normalized.push(prefix.as_os_str()), + #[allow(clippy::path_buf_push_overwrite)] + Component::RootDir => normalized.push("/"), + Component::CurDir => {} // skip "." + Component::ParentDir => { + // Keep ".." — validation will catch it separately + normalized.push(".."); + } + Component::Normal(c) => normalized.push(c), + } + } + normalized.to_string_lossy().to_string() +} + #[cfg(test)] mod tests { use super::*; @@ -201,4 +230,17 @@ mod tests { std::fs::set_permissions(&file, std::fs::Permissions::from_mode(0o600)).unwrap(); assert!(!is_file_permissions_too_open(&file)); } + + #[test] + fn normalize_path_collapses_separators() { + assert_eq!(normalize_path("/usr//lib"), "/usr/lib"); + assert_eq!(normalize_path("/usr/./lib"), "/usr/lib"); + assert_eq!(normalize_path("/tmp/"), "/tmp"); + } + + #[test] + fn normalize_path_preserves_parent_dir() { + // normalize_path preserves ".." 
— validation catches it separately + assert_eq!(normalize_path("/usr/../etc"), "/usr/../etc"); + } } diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 26c8fc9d3..aaabbf926 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -858,26 +858,10 @@ fn truncate_for_display(s: &str) -> String { /// /// This is a lexical normalization only — it does NOT resolve symlinks or /// check the filesystem. -pub fn normalize_path(path: &str) -> String { - use std::path::Component; - - let p = Path::new(path); - let mut normalized = std::path::PathBuf::new(); - for component in p.components() { - match component { - Component::Prefix(prefix) => normalized.push(prefix.as_os_str()), - #[allow(clippy::path_buf_push_overwrite)] - Component::RootDir => normalized.push("/"), - Component::CurDir => {} // skip "." - Component::ParentDir => { - // Keep ".." — validation will catch it separately - normalized.push(".."); - } - Component::Normal(c) => normalized.push(c), - } - } - normalized.to_string_lossy().to_string() -} +/// +/// Re-exported from `openshell-core` so existing call sites +/// (`openshell_policy::normalize_path`) keep resolving. +pub use openshell_core::paths::normalize_path; // --------------------------------------------------------------------------- // Tests From ce0c9b46b131e5562bfff49ce885cbd49a8e61fb Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 18:37:55 +0300 Subject: [PATCH 06/49] refactor(core): lift SandboxPolicy and friends to openshell-core Move openshell-sandbox/src/policy.rs (SandboxPolicy, NetworkPolicy, ProxyPolicy, FilesystemPolicy, LandlockPolicy, ProcessPolicy, NetworkMode, LandlockCompatibility, plus their Proto* TryFrom/From impls) to openshell-core/src/policy.rs. Both prospective supervisor leaves (networking and process) dispatch on SandboxPolicy. 
Hosting it in openshell-core lets either leaf reach for it without depending on the other (or on the future orchestrator). The From impl now calls the in-crate openshell_core::paths::normalize_path lifted in the previous commit, which is what made this move cycle-free. Update all crate::policy::* call sites in openshell-sandbox to openshell_core::policy::*. Signed-off-by: Radoslav Hubenov --- crates/openshell-core/src/lib.rs | 1 + crates/{openshell-sandbox => openshell-core}/src/policy.rs | 7 ++++--- crates/openshell-sandbox/src/lib.rs | 7 +++---- crates/openshell-sandbox/src/opa.rs | 2 +- crates/openshell-sandbox/src/process.rs | 4 ++-- crates/openshell-sandbox/src/proxy.rs | 2 +- crates/openshell-sandbox/src/sandbox/linux/landlock.rs | 2 +- crates/openshell-sandbox/src/sandbox/linux/mod.rs | 4 ++-- crates/openshell-sandbox/src/sandbox/linux/seccomp.rs | 2 +- crates/openshell-sandbox/src/sandbox/mod.rs | 2 +- crates/openshell-sandbox/src/ssh.rs | 4 ++-- 11 files changed, 19 insertions(+), 18 deletions(-) rename crates/{openshell-sandbox => openshell-core}/src/policy.rs (95%) diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 52cdba5c4..681cf9e99 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -21,6 +21,7 @@ pub mod inference; pub mod metadata; pub mod net; pub mod paths; +pub mod policy; pub mod progress; pub mod proto; pub mod sandbox_env; diff --git a/crates/openshell-sandbox/src/policy.rs b/crates/openshell-core/src/policy.rs similarity index 95% rename from crates/openshell-sandbox/src/policy.rs rename to crates/openshell-core/src/policy.rs index 0827fa0d0..1645b9da4 100644 --- a/crates/openshell-sandbox/src/policy.rs +++ b/crates/openshell-core/src/policy.rs @@ -3,7 +3,8 @@ //! Sandbox policy configuration. 
-use openshell_core::proto::{ +use crate::paths::normalize_path; +use crate::proto::{ FilesystemPolicy as ProtoFilesystemPolicy, LandlockPolicy as ProtoLandlockPolicy, ProcessPolicy as ProtoProcessPolicy, SandboxPolicy as ProtoSandboxPolicy, }; @@ -125,12 +126,12 @@ impl From for FilesystemPolicy { read_only: proto .read_only .into_iter() - .map(|p| PathBuf::from(openshell_policy::normalize_path(&p))) + .map(|p| PathBuf::from(normalize_path(&p))) .collect(), read_write: proto .read_write .into_iter() - .map(|p| PathBuf::from(openshell_policy::normalize_path(&p))) + .map(|p| PathBuf::from(normalize_path(&p))) .collect(), include_workdir: proto.include_workdir, } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 8bcc1d275..b7e889d70 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -15,7 +15,6 @@ pub mod l7; pub mod log_push; pub mod mechanistic_mapper; pub mod opa; -mod policy; mod policy_local; mod process; pub mod procfs; @@ -174,8 +173,8 @@ use crate::l7::tls::{ write_ca_files, }; use crate::opa::OpaEngine; -use crate::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use crate::proxy::ProxyHandle; +use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; pub use process::{ProcessHandle, ProcessStatus}; @@ -1799,7 +1798,7 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { )] mod baseline_tests { use super::*; - use crate::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; + use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[test] fn proc_not_in_both_read_only_and_read_write_when_gpu_present() { @@ -2843,7 +2842,7 @@ fn format_setting_value(es: &openshell_core::proto::EffectiveSetting) -> String )] mod tests { use super::*; - use crate::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; + use 
openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[cfg(unix)] use nix::unistd::{Group, User}; #[cfg(unix)] diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-sandbox/src/opa.rs index f73f3bc14..132d8869d 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -7,8 +7,8 @@ //! access decisions. The engine is loaded once at sandbox startup and queried //! on every proxy CONNECT request. -use crate::policy::{FilesystemPolicy, LandlockCompatibility, LandlockPolicy, ProcessPolicy}; use miette::Result; +use openshell_core::policy::{FilesystemPolicy, LandlockCompatibility, LandlockPolicy, ProcessPolicy}; use openshell_core::proto::SandboxPolicy as ProtoSandboxPolicy; use std::path::{Path, PathBuf}; use std::sync::{ diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 76786a84d..ce32402a8 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -4,10 +4,10 @@ //! Process management and signal handling. 
use crate::child_env; -use crate::policy::{NetworkMode, SandboxPolicy}; use crate::sandbox; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; #[cfg(target_os = "linux")] use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; @@ -658,7 +658,7 @@ impl From for ProcessStatus { #[cfg(test)] mod tests { use super::*; - use crate::policy::{ + use openshell_core::policy::{ FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, }; #[cfg(unix)] diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 30466a465..5eff84623 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -7,12 +7,12 @@ use crate::denial_aggregator::DenialEvent; use crate::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; -use crate::policy::ProxyPolicy; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; use crate::provider_credentials::ProviderCredentialState; use crate::secrets::{SecretResolver, rewrite_header_line_checked}; use miette::{IntoDiagnostic, Result}; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; +use openshell_core::policy::ProxyPolicy; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, NetworkActivityBuilder, Process, SeverityId, StatusId, Url as OcsfUrl, ocsf_emit, diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index 6b121e0ca..fc5e660d3 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -3,11 +3,11 @@ //! Landlock filesystem sandboxing. 
-use crate::policy::{LandlockCompatibility, SandboxPolicy}; use landlock::{ ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, RulesetAttr, RulesetCreatedAttr, }; +use openshell_core::policy::{LandlockCompatibility, SandboxPolicy}; use miette::{IntoDiagnostic, Result}; use std::path::{Path, PathBuf}; use tracing::debug; diff --git a/crates/openshell-sandbox/src/sandbox/linux/mod.rs b/crates/openshell-sandbox/src/sandbox/linux/mod.rs index a3a32c77a..487d8b4ad 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/mod.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/mod.rs @@ -8,8 +8,8 @@ pub mod netns; mod nft_ruleset; mod seccomp; -use crate::policy::SandboxPolicy; use miette::Result; +use openshell_core::policy::SandboxPolicy; use std::path::PathBuf; use std::sync::Once; @@ -118,7 +118,7 @@ pub fn log_sandbox_readiness(policy: &SandboxPolicy, workdir: Option<&str>) { // previously invisible because it only fired inside pre_exec. let is_best_effort = matches!( policy.landlock.compatibility, - crate::policy::LandlockCompatibility::BestEffort + openshell_core::policy::LandlockCompatibility::BestEffort ); let (desc, msg) = if is_best_effort { ( diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs index 1044623f5..4933cd181 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs @@ -12,8 +12,8 @@ //! needed syscalls (`execveat+AT_EMPTY_PATH`, `unshare+CLONE_NEWUSER`, //! 
`seccomp+SET_MODE_FILTER`) -use crate::policy::{NetworkMode, SandboxPolicy}; use miette::{IntoDiagnostic, Result}; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; use seccompiler::{ SeccompAction, SeccompCmpArgLen, SeccompCmpOp, SeccompCondition, SeccompFilter, SeccompRule, apply_filter, apply_filter_all_threads, diff --git a/crates/openshell-sandbox/src/sandbox/mod.rs b/crates/openshell-sandbox/src/sandbox/mod.rs index 95aeae492..067e60ea5 100644 --- a/crates/openshell-sandbox/src/sandbox/mod.rs +++ b/crates/openshell-sandbox/src/sandbox/mod.rs @@ -3,8 +3,8 @@ //! Platform sandboxing implementation. -use crate::policy::SandboxPolicy; use miette::Result; +use openshell_core::policy::SandboxPolicy; #[cfg(target_os = "linux")] pub mod linux; diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-sandbox/src/ssh.rs index 67fbc7e57..b0904a504 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-sandbox/src/ssh.rs @@ -4,10 +4,10 @@ //! Embedded SSH server for sandbox access. 
use crate::child_env; -use crate::policy::SandboxPolicy; use crate::process::drop_privileges; use crate::provider_credentials::ProviderCredentialState; use crate::sandbox; +use openshell_core::policy::SandboxPolicy; #[cfg(target_os = "linux")] use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; @@ -1470,7 +1470,7 @@ mod tests { #[cfg(unix)] #[test] fn pre_exec_always_calls_drop_privileges() { - use crate::policy::{ + use openshell_core::policy::{ FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, }; From 2bade765ee0e4d85339861155e20518a9493c4f8 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 18:46:59 +0300 Subject: [PATCH 07/49] refactor(supervisor-process): move child_env from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit child_env (proxy_env_vars, tls_env_vars) is process-side only — consumed by process.rs and ssh.rs. With the orchestrator staying in openshell-sandbox (Shape A), openshell-sandbox depends on the new leaf crates, so process-only modules can land in openshell-supervisor-process directly. Add openshell-supervisor-process as a path dependency of openshell-sandbox. Update process.rs and ssh.rs to import from openshell_supervisor_process::child_env. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-sandbox/Cargo.toml | 1 + crates/openshell-sandbox/src/lib.rs | 1 - crates/openshell-sandbox/src/process.rs | 2 +- crates/openshell-sandbox/src/ssh.rs | 2 +- .../src/child_env.rs | 0 crates/openshell-supervisor-process/src/lib.rs | 4 +++- 7 files changed, 7 insertions(+), 4 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/child_env.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 22f02198b..1859525ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3656,6 +3656,7 @@ dependencies = [ "openshell-ocsf", "openshell-policy", "openshell-router", + "openshell-supervisor-process", "rand_core 0.6.4", "rcgen", "regorus", diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 6d527bc53..e4ae77fd4 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -19,6 +19,7 @@ openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } openshell-policy = { path = "../openshell-policy" } openshell-router = { path = "../openshell-router" } +openshell-supervisor-process = { path = "../openshell-supervisor-process" } # Async runtime tokio = { workspace = true } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index b7e889d70..7b0afd78e 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -6,7 +6,6 @@ //! This crate provides process sandboxing and monitoring capabilities. pub mod bypass_monitor; -mod child_env; pub mod debug_rpc; pub mod denial_aggregator; mod grpc_client; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index ce32402a8..69b6649d2 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -3,11 +3,11 @@ //! Process management and signal handling. 
-use crate::child_env; use crate::sandbox; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, SandboxPolicy}; +use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-sandbox/src/ssh.rs index b0904a504..3a2a1142b 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-sandbox/src/ssh.rs @@ -3,11 +3,11 @@ //! Embedded SSH server for sandbox access. -use crate::child_env; use crate::process::drop_privileges; use crate::provider_credentials::ProviderCredentialState; use crate::sandbox; use openshell_core::policy::SandboxPolicy; +use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; diff --git a/crates/openshell-sandbox/src/child_env.rs b/crates/openshell-supervisor-process/src/child_env.rs similarity index 100% rename from crates/openshell-sandbox/src/child_env.rs rename to crates/openshell-supervisor-process/src/child_env.rs diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 4ba7d4871..5f89367d4 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -1,9 +1,11 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Process component of the OpenShell supervisor. +//! Process component of the `OpenShell` supervisor. //! //! Owns the entrypoint process spawn, SSH server, supervisor session, network //! namespace, bypass monitor, child environment construction, skills install, //! and log push. Populated by follow-up commits as modules migrate out of //! 
`openshell-sandbox`. + +pub mod child_env; From 1327bbada7f8eb35333b4decda69304a70a15eaa Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 18:54:36 +0300 Subject: [PATCH 08/49] refactor(supervisor-process): move skills from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static skills installer (and its embedded resource directory) out of openshell-sandbox into openshell-supervisor-process. The module is process-side only — invoked once during sandbox start to drop agent skill files into the workspace — and has no cross-leaf consumers. Adds miette as a dependency and tempfile as a dev-dependency on openshell-supervisor-process. Signed-off-by: Radoslav Hubenov --- Cargo.lock | 4 ++++ crates/openshell-sandbox/src/lib.rs | 2 +- crates/openshell-supervisor-process/Cargo.toml | 2 ++ crates/openshell-supervisor-process/src/lib.rs | 1 + .../src/skills.rs | 0 .../src/skills/policy-advisor/SKILL.md | 0 .../src/skills/policy_advisor.md | 0 7 files changed, 8 insertions(+), 1 deletion(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/skills.rs (100%) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/skills/policy-advisor/SKILL.md (100%) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/skills/policy_advisor.md (100%) diff --git a/Cargo.lock b/Cargo.lock index 1859525ad..1610d6182 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3771,6 +3771,10 @@ version = "0.0.0" [[package]] name = "openshell-supervisor-process" version = "0.0.0" +dependencies = [ + "miette", + "tempfile", +] [[package]] name = "openshell-tui" diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 7b0afd78e..7e2179515 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -21,7 +21,6 @@ mod provider_credentials; pub mod proxy; mod sandbox; mod secrets; -mod skills; mod ssh; mod 
supervisor_session; @@ -174,6 +173,7 @@ use crate::l7::tls::{ use crate::opa::OpaEngine; use crate::proxy::ProxyHandle; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; +use openshell_supervisor_process::skills; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; pub use process::{ProcessHandle, ProcessStatus}; diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index faf379f51..ec90b9dbe 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -11,8 +11,10 @@ repository.workspace = true rust-version.workspace = true [dependencies] +miette = { workspace = true } [dev-dependencies] +tempfile = "3" [lints] workspace = true diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 5f89367d4..26f9f5650 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -9,3 +9,4 @@ //! `openshell-sandbox`. 
pub mod child_env; +pub mod skills; diff --git a/crates/openshell-sandbox/src/skills.rs b/crates/openshell-supervisor-process/src/skills.rs similarity index 100% rename from crates/openshell-sandbox/src/skills.rs rename to crates/openshell-supervisor-process/src/skills.rs diff --git a/crates/openshell-sandbox/src/skills/policy-advisor/SKILL.md b/crates/openshell-supervisor-process/src/skills/policy-advisor/SKILL.md similarity index 100% rename from crates/openshell-sandbox/src/skills/policy-advisor/SKILL.md rename to crates/openshell-supervisor-process/src/skills/policy-advisor/SKILL.md diff --git a/crates/openshell-sandbox/src/skills/policy_advisor.md b/crates/openshell-supervisor-process/src/skills/policy_advisor.md similarity index 100% rename from crates/openshell-sandbox/src/skills/policy_advisor.md rename to crates/openshell-supervisor-process/src/skills/policy_advisor.md From 8bbdbabc7cb8301597e0fe936c0968a1bc86bd6c Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 18:59:21 +0300 Subject: [PATCH 09/49] refactor(supervisor-networking): move mechanistic_mapper from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the mechanistic mapper (HTTP method/path → operation classifier that derives policy proposals from connection summaries) out of openshell-sandbox into openshell-supervisor-networking. Single internal caller (run_policy_poll_loop in lib.rs) and only depends on openshell-core + tracing — no cross-leaf entanglement. First population of the openshell-supervisor-networking crate; adds openshell-core and tracing as dependencies. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 5 +++++ crates/openshell-sandbox/Cargo.toml | 1 + crates/openshell-sandbox/src/lib.rs | 2 +- crates/openshell-supervisor-networking/Cargo.toml | 3 +++ crates/openshell-supervisor-networking/src/lib.rs | 4 +++- .../src/mechanistic_mapper.rs | 0 6 files changed, 13 insertions(+), 2 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/mechanistic_mapper.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 1610d6182..af86c0a62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3656,6 +3656,7 @@ dependencies = [ "openshell-ocsf", "openshell-policy", "openshell-router", + "openshell-supervisor-networking", "openshell-supervisor-process", "rand_core 0.6.4", "rcgen", @@ -3767,6 +3768,10 @@ dependencies = [ [[package]] name = "openshell-supervisor-networking" version = "0.0.0" +dependencies = [ + "openshell-core", + "tracing", +] [[package]] name = "openshell-supervisor-process" diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index e4ae77fd4..e6ed06696 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -19,6 +19,7 @@ openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } openshell-policy = { path = "../openshell-policy" } openshell-router = { path = "../openshell-router" } +openshell-supervisor-networking = { path = "../openshell-supervisor-networking" } openshell-supervisor-process = { path = "../openshell-supervisor-process" } # Async runtime diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 7e2179515..396588f40 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -12,7 +12,6 @@ mod grpc_client; mod identity; pub mod l7; pub mod log_push; -pub mod mechanistic_mapper; pub mod opa; mod policy_local; mod process; @@ -174,6 +173,7 @@ use crate::opa::OpaEngine; use crate::proxy::ProxyHandle; use 
openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_supervisor_process::skills; +use openshell_supervisor_networking::mechanistic_mapper; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; pub use process::{ProcessHandle, ProcessStatus}; diff --git a/crates/openshell-supervisor-networking/Cargo.toml b/crates/openshell-supervisor-networking/Cargo.toml index 8d3e110e7..30df490b3 100644 --- a/crates/openshell-supervisor-networking/Cargo.toml +++ b/crates/openshell-supervisor-networking/Cargo.toml @@ -11,6 +11,9 @@ repository.workspace = true rust-version.workspace = true [dependencies] +openshell-core = { path = "../openshell-core" } + +tracing = { workspace = true } [dev-dependencies] diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index 4dcda101c..b6f210c98 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -1,8 +1,10 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Networking component of the OpenShell supervisor. +//! Networking component of the `OpenShell` supervisor. //! //! Owns the egress proxy, L7 enforcement, OPA policy engine, identity cache, //! inference routing, TLS interception, and denial aggregation. Populated by //! follow-up commits as modules migrate out of `openshell-sandbox`. 
+ +pub mod mechanistic_mapper; diff --git a/crates/openshell-sandbox/src/mechanistic_mapper.rs b/crates/openshell-supervisor-networking/src/mechanistic_mapper.rs similarity index 100% rename from crates/openshell-sandbox/src/mechanistic_mapper.rs rename to crates/openshell-supervisor-networking/src/mechanistic_mapper.rs From 6da781db91b5eb2741c3428d03f523e12e1f35ed Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 19:12:14 +0300 Subject: [PATCH 10/49] refactor(core): lift procfs to openshell-core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move procfs (PID lookups, ancestor walking, /proc/net/tcp socket-owner resolution, file SHA256 hashing) from openshell-sandbox into openshell-core. The module is consumed cross-leaf — by bypass_monitor on the process side and by identity / proxy on the networking side — so it has to sit below both leaves. Adds tracing, sha2, and hex as dependencies on openshell-core. Updates the three call sites in openshell-sandbox to import from openshell_core::procfs. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 3 +++ crates/openshell-core/Cargo.toml | 3 +++ crates/openshell-core/src/lib.rs | 1 + .../{openshell-sandbox => openshell-core}/src/procfs.rs | 0 crates/openshell-sandbox/src/bypass_monitor.rs | 2 +- crates/openshell-sandbox/src/identity.rs | 4 ++-- crates/openshell-sandbox/src/lib.rs | 1 - crates/openshell-sandbox/src/proxy.rs | 8 ++++---- 8 files changed, 14 insertions(+), 8 deletions(-) rename crates/{openshell-sandbox => openshell-core}/src/procfs.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index af86c0a62..20161e444 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3457,6 +3457,7 @@ dependencies = [ name = "openshell-core" version = "0.0.0" dependencies = [ + "hex", "ipnet", "miette", "prost", @@ -3464,10 +3465,12 @@ dependencies = [ "protobuf-src", "serde", "serde_json", + "sha2 0.10.9", "tempfile", "thiserror 2.0.18", "tonic", "tonic-build", + "tracing", "url", ] diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index b03fb1494..76b8a253f 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -18,8 +18,11 @@ thiserror = { workspace = true } miette = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +tracing = { workspace = true } url = { workspace = true } ipnet = "2" +hex = "0.4" +sha2 = { workspace = true } [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. 
diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 681cf9e99..82dc5aadf 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -22,6 +22,7 @@ pub mod metadata; pub mod net; pub mod paths; pub mod policy; +pub mod procfs; pub mod progress; pub mod proto; pub mod sandbox_env; diff --git a/crates/openshell-sandbox/src/procfs.rs b/crates/openshell-core/src/procfs.rs similarity index 100% rename from crates/openshell-sandbox/src/procfs.rs rename to crates/openshell-core/src/procfs.rs diff --git a/crates/openshell-sandbox/src/bypass_monitor.rs b/crates/openshell-sandbox/src/bypass_monitor.rs index 9e37ef27c..3b71cd978 100644 --- a/crates/openshell-sandbox/src/bypass_monitor.rs +++ b/crates/openshell-sandbox/src/bypass_monitor.rs @@ -295,7 +295,7 @@ pub fn spawn( fn resolve_process_identity(entrypoint_pid: u32, src_port: u16) -> (String, String, String) { #[cfg(target_os = "linux")] { - use crate::procfs; + use openshell_core::procfs; match procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, src_port) { Ok(socket_owners) => { diff --git a/crates/openshell-sandbox/src/identity.rs b/crates/openshell-sandbox/src/identity.rs index 49809f95b..2da4072c6 100644 --- a/crates/openshell-sandbox/src/identity.rs +++ b/crates/openshell-sandbox/src/identity.rs @@ -8,7 +8,7 @@ //! path must match the cached hash. A mismatch indicates the binary was replaced //! mid-sandbox and the request is denied. 
-use crate::procfs; +use openshell_core::procfs; use miette::Result; use std::collections::HashMap; use std::fs::Metadata; @@ -169,7 +169,7 @@ impl BinaryIdentityCache { #[cfg(test)] mod tests { use super::*; - use crate::procfs; + use openshell_core::procfs; use std::io::Write; use std::time::Duration; diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 396588f40..2727b3984 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -15,7 +15,6 @@ pub mod log_push; pub mod opa; mod policy_local; mod process; -pub mod procfs; mod provider_credentials; pub mod proxy; mod sandbox; diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 5eff84623..1fb80a3e4 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -1179,7 +1179,7 @@ fn resolve_owner_identity( identity_cache: &BinaryIdentityCache, ) -> std::result::Result { let bin_path = - crate::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| IdentityError { + openshell_core::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| IdentityError { reason: format!("failed to resolve peer binary for PID {owner_pid}: {e}"), binary: None, binary_pid: Some(owner_pid), @@ -1195,7 +1195,7 @@ fn resolve_owner_identity( ancestors: vec![], })?; - let ancestors = crate::procfs::collect_ancestor_binaries(owner_pid, entrypoint_pid); + let ancestors = openshell_core::procfs::collect_ancestor_binaries(owner_pid, entrypoint_pid); for ancestor in &ancestors { identity_cache @@ -1213,7 +1213,7 @@ fn resolve_owner_identity( let mut exclude = ancestors.clone(); exclude.push(bin_path.clone()); - let cmdline_paths = crate::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); + let cmdline_paths = openshell_core::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); Ok(ResolvedIdentity { bin_path, @@ -1243,7 +1243,7 @@ fn resolve_process_identity( 
peer_port: u16, identity_cache: &BinaryIdentityCache, ) -> std::result::Result { - let socket_owners = crate::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port) + let socket_owners = openshell_core::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port) .map_err(|e| IdentityError { reason: format!("failed to resolve peer binary: {e}"), binary: None, From 509be1c02339a0ab19030e6e947f55e06fdd18a3 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 19:16:18 +0300 Subject: [PATCH 11/49] refactor(supervisor-networking): move identity from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move BinaryIdentityCache (path → SHA256 cache used to identify the process behind an outbound connection) from openshell-sandbox into openshell-supervisor-networking. The cache is consumed only by the networking-side proxy and the orchestrator; with procfs already in openshell-core there are no remaining cross-leaf dependencies. Adds miette as a dependency and tempfile as a dev-dependency on openshell-supervisor-networking. Adds a Default impl for BinaryIdentityCache to satisfy clippy::new_without_default now that the type is publicly exposed across crates. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 2 ++ crates/openshell-sandbox/src/lib.rs | 3 +-- crates/openshell-sandbox/src/proxy.rs | 6 +++--- crates/openshell-supervisor-networking/Cargo.toml | 2 ++ .../src/identity.rs | 6 ++++++ crates/openshell-supervisor-networking/src/lib.rs | 1 + 6 files changed, 15 insertions(+), 5 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/identity.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index 20161e444..5fa2bceb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3772,7 +3772,9 @@ dependencies = [ name = "openshell-supervisor-networking" version = "0.0.0" dependencies = [ + "miette", "openshell-core", + "tempfile", "tracing", ] diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 2727b3984..29034db01 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -9,7 +9,6 @@ pub mod bypass_monitor; pub mod debug_rpc; pub mod denial_aggregator; mod grpc_client; -mod identity; pub mod l7; pub mod log_push; pub mod opa; @@ -163,7 +162,7 @@ pub(crate) mod test_helpers { } } -use crate::identity::BinaryIdentityCache; +use openshell_supervisor_networking::identity::BinaryIdentityCache; use crate::l7::tls::{ CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, write_ca_files, diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 1fb80a3e4..570311738 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -4,7 +4,7 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. 
use crate::denial_aggregator::DenialEvent; -use crate::identity::BinaryIdentityCache; +use openshell_supervisor_networking::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; @@ -6645,7 +6645,7 @@ network_policies: #[cfg(target_os = "linux")] #[test] fn resolve_process_identity_surfaces_binary_integrity_violation_on_hot_swap() { - use crate::identity::BinaryIdentityCache; + use openshell_supervisor_networking::identity::BinaryIdentityCache; use std::io::Read; use std::net::TcpListener; use std::os::unix::fs::PermissionsExt; @@ -6777,7 +6777,7 @@ network_policies: // SELinux-enforcing hosts. Fix by building a test-sleep-helper binary in // the same crate so it inherits the user_home_t label. fn resolve_process_identity_denies_fork_exec_shared_socket_ambiguity() { - use crate::identity::BinaryIdentityCache; + use openshell_supervisor_networking::identity::BinaryIdentityCache; use std::ffi::CString; use std::net::{TcpListener, TcpStream}; use std::os::fd::AsRawFd; diff --git a/crates/openshell-supervisor-networking/Cargo.toml b/crates/openshell-supervisor-networking/Cargo.toml index 30df490b3..10011766e 100644 --- a/crates/openshell-supervisor-networking/Cargo.toml +++ b/crates/openshell-supervisor-networking/Cargo.toml @@ -13,9 +13,11 @@ rust-version.workspace = true [dependencies] openshell-core = { path = "../openshell-core" } +miette = { workspace = true } tracing = { workspace = true } [dev-dependencies] +tempfile = "3" [lints] workspace = true diff --git a/crates/openshell-sandbox/src/identity.rs b/crates/openshell-supervisor-networking/src/identity.rs similarity index 99% rename from crates/openshell-sandbox/src/identity.rs rename to crates/openshell-supervisor-networking/src/identity.rs index 2da4072c6..73a62ff8d 100644 --- a/crates/openshell-sandbox/src/identity.rs +++ 
b/crates/openshell-supervisor-networking/src/identity.rs @@ -79,6 +79,12 @@ pub struct BinaryIdentityCache { hashes: Mutex>, } +impl Default for BinaryIdentityCache { + fn default() -> Self { + Self::new() + } +} + impl BinaryIdentityCache { pub fn new() -> Self { Self { diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index b6f210c98..5b1f907e6 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -8,3 +8,4 @@ //! follow-up commits as modules migrate out of `openshell-sandbox`. pub mod mechanistic_mapper; +pub mod identity; From 058bc45615583a838f1ed14c0a34b8cedd069058 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 19:35:23 +0300 Subject: [PATCH 12/49] refactor(supervisor-process): move agent-proposals flag from openshell-sandbox Move AGENT_PROPOSALS_ENABLED, agent_proposals_enabled(), and the test-only ProposalsFlagGuard out of openshell-sandbox into openshell-supervisor-process::proposals. The flag is read only by the process-side policy_local route handler and the orchestrator; lifting it to openshell-core would have made core carry sandbox-owned runtime state without buying anything. The test-only ProposalsFlagGuard is still consumed from networking-side l7/rest tests today (until the wider Q2 OCSF/gRPC injection work lands). Expose it via a new optional `test-helpers` feature on openshell-supervisor-process so test crates opt in explicitly without pulling tokio sync primitives into production builds. openshell-sandbox keeps its existing crate-private path (`crate::AGENT_PROPOSALS_ENABLED`, `crate::test_helpers`) via re-exports so call sites and tests are unchanged. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-sandbox/Cargo.toml | 1 + crates/openshell-sandbox/src/lib.rs | 71 +--------------- .../openshell-supervisor-process/Cargo.toml | 7 ++ .../openshell-supervisor-process/src/lib.rs | 1 + .../src/proposals.rs | 83 +++++++++++++++++++ 6 files changed, 97 insertions(+), 67 deletions(-) create mode 100644 crates/openshell-supervisor-process/src/proposals.rs diff --git a/Cargo.lock b/Cargo.lock index 5fa2bceb6..6adefbac2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3784,6 +3784,7 @@ version = "0.0.0" dependencies = [ "miette", "tempfile", + "tokio", ] [[package]] diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index e6ed06696..be575d971 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -92,6 +92,7 @@ tempfile = "3" uuid = { version = "1", features = ["v4"] } [dev-dependencies] +openshell-supervisor-process = { path = "../openshell-supervisor-process", features = ["test-helpers"] } tempfile = "3" temp-env = "0.3" tokio-tungstenite = { workspace = true } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 29034db01..8c13b25f0 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -92,75 +92,12 @@ pub(crate) fn ocsf_ctx() -> &'static SandboxContext { /// to gate the agent-controlled mutation surface. Exposed `pub(crate)` so /// unit tests in sibling modules can flip the flag through a serialized /// guard (see `policy_local::tests::ProposalsFlagGuard`). -pub(crate) static AGENT_PROPOSALS_ENABLED: OnceLock> = - OnceLock::new(); - -/// Read the current value of the agent proposals feature flag. -/// -/// Returns `false` if `run_sandbox()` has not initialized the flag (e.g. -/// during unit tests), matching the documented default for the setting. 
-pub(crate) fn agent_proposals_enabled() -> bool { - AGENT_PROPOSALS_ENABLED - .get() - .is_some_and(|flag| flag.load(Ordering::Relaxed)) -} +pub(crate) use openshell_supervisor_process::proposals::{ + AGENT_PROPOSALS_ENABLED, agent_proposals_enabled, +}; -/// Test-only helpers shared across sibling test modules. #[cfg(test)] -pub(crate) mod test_helpers { - #![allow( - clippy::redundant_pub_crate, - reason = "intentional crate-private module" - )] - use std::sync::Arc; - use std::sync::LazyLock; - use std::sync::atomic::{AtomicBool, Ordering}; - use tokio::sync::MutexGuard; - - static PROPOSALS_FLAG_LOCK: LazyLock> = - LazyLock::new(|| tokio::sync::Mutex::new(())); - - /// Guard for tests that toggle the process-wide - /// `AGENT_PROPOSALS_ENABLED` flag. Acquires a process-wide async mutex, - /// swaps in the requested value, and restores the previous value on drop. - /// Hold the guard for the duration of any code that reads - /// `agent_proposals_enabled()`. - pub(crate) struct ProposalsFlagGuard { - prev: bool, - flag: Arc, - _lock: MutexGuard<'static, ()>, - } - - impl ProposalsFlagGuard { - pub(crate) async fn set(enabled: bool) -> Self { - let lock = PROPOSALS_FLAG_LOCK.lock().await; - Self::with_lock(enabled, lock) - } - - pub(crate) fn set_blocking(enabled: bool) -> Self { - let lock = PROPOSALS_FLAG_LOCK.blocking_lock(); - Self::with_lock(enabled, lock) - } - - fn with_lock(enabled: bool, lock: MutexGuard<'static, ()>) -> Self { - let flag = super::AGENT_PROPOSALS_ENABLED - .get_or_init(|| Arc::new(AtomicBool::new(false))) - .clone(); - let prev = flag.swap(enabled, Ordering::Relaxed); - Self { - prev, - flag, - _lock: lock, - } - } - } - - impl Drop for ProposalsFlagGuard { - fn drop(&mut self) { - self.flag.store(self.prev, Ordering::Relaxed); - } - } -} +pub(crate) use openshell_supervisor_process::proposals::test_helpers; use openshell_supervisor_networking::identity::BinaryIdentityCache; use crate::l7::tls::{ diff --git 
a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index ec90b9dbe..cab945aa1 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -12,6 +12,13 @@ rust-version.workspace = true [dependencies] miette = { workspace = true } +tokio = { workspace = true } + +[features] +## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test +## code in other crates. Enabled by openshell-supervisor-networking and +## openshell-sandbox dev-dependencies. +test-helpers = [] [dev-dependencies] tempfile = "3" diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 26f9f5650..b8b19caae 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -9,4 +9,5 @@ //! `openshell-sandbox`. pub mod child_env; +pub mod proposals; pub mod skills; diff --git a/crates/openshell-supervisor-process/src/proposals.rs b/crates/openshell-supervisor-process/src/proposals.rs new file mode 100644 index 000000000..fcb6b110c --- /dev/null +++ b/crates/openshell-supervisor-process/src/proposals.rs @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide flag controlling agent-driven policy proposals. +//! +//! Initialised once during sandbox start from the `agent_policy_proposals_enabled` +//! setting and updated by the policy poll loop when the setting changes. Read +//! by the `policy.local` route handler and by the skills installer to gate the +//! agent-controlled mutation surface. Tests use [`test_helpers::ProposalsFlagGuard`] +//! to flip the flag through a serialized guard. + +use std::sync::Arc; +use std::sync::OnceLock; +use std::sync::atomic::{AtomicBool, Ordering}; + +/// Process-wide handle to the agent-proposals flag. 
+/// +/// Set once by `run_sandbox()` during start; subsequent attempts to set it are +/// ignored. The contained `AtomicBool` is updated by the policy poll loop. +pub static AGENT_PROPOSALS_ENABLED: OnceLock> = OnceLock::new(); + +/// Read the current value of the agent proposals feature flag. +/// +/// Returns `false` if the flag has not been initialized (e.g. during unit +/// tests), matching the documented default for the setting. +pub fn agent_proposals_enabled() -> bool { + AGENT_PROPOSALS_ENABLED + .get() + .is_some_and(|flag| flag.load(Ordering::Relaxed)) +} + +/// Test-only helpers shared across crates' test modules. +#[cfg(any(test, feature = "test-helpers"))] +pub mod test_helpers { + use std::sync::Arc; + use std::sync::LazyLock; + use std::sync::atomic::{AtomicBool, Ordering}; + use tokio::sync::MutexGuard; + + static PROPOSALS_FLAG_LOCK: LazyLock> = + LazyLock::new(|| tokio::sync::Mutex::new(())); + + /// Guard for tests that toggle the process-wide flag. + /// + /// Acquires a process-wide async mutex, swaps in the requested value, and + /// restores the previous value on drop. Hold the guard for the duration of + /// any code that reads `agent_proposals_enabled()`. 
+ pub struct ProposalsFlagGuard { + prev: bool, + flag: Arc, + _lock: MutexGuard<'static, ()>, + } + + impl ProposalsFlagGuard { + pub async fn set(enabled: bool) -> Self { + let lock = PROPOSALS_FLAG_LOCK.lock().await; + Self::with_lock(enabled, lock) + } + + pub fn set_blocking(enabled: bool) -> Self { + let lock = PROPOSALS_FLAG_LOCK.blocking_lock(); + Self::with_lock(enabled, lock) + } + + fn with_lock(enabled: bool, lock: MutexGuard<'static, ()>) -> Self { + let flag = super::AGENT_PROPOSALS_ENABLED + .get_or_init(|| Arc::new(AtomicBool::new(false))) + .clone(); + let prev = flag.swap(enabled, Ordering::Relaxed); + Self { + prev, + flag, + _lock: lock, + } + } + } + + impl Drop for ProposalsFlagGuard { + fn drop(&mut self) { + self.flag.store(self.prev, Ordering::Relaxed); + } + } +} From b90f7ee82727df1995886421653f19cfad6aae65 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 20:12:13 +0300 Subject: [PATCH 13/49] refactor(core): lift secrets to openshell-core Move crates/openshell-sandbox/src/secrets.rs to crates/openshell-core/src/secrets.rs so both supervisor leaves can reach SecretResolver and the placeholder helpers without depending on openshell-sandbox. Add base64 to openshell-core deps (only stdlib + base64 are used). Promote previously pub(crate) constructors and methods on SecretResolver to pub since cross-crate callers (provider_credentials, proxy/L7 tests) now name them across the crate boundary. Update import paths in proxy.rs, l7/{rest,relay,websocket}.rs, and provider_credentials.rs from crate::secrets to openshell_core::secrets. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-core/Cargo.toml | 1 + crates/openshell-core/src/lib.rs | 1 + .../src/secrets.rs | 16 ++++++++-------- crates/openshell-sandbox/src/l7/relay.rs | 2 +- crates/openshell-sandbox/src/l7/rest.rs | 4 ++-- crates/openshell-sandbox/src/l7/websocket.rs | 4 ++-- crates/openshell-sandbox/src/lib.rs | 1 - .../src/provider_credentials.rs | 2 +- crates/openshell-sandbox/src/proxy.rs | 12 ++++++------ 10 files changed, 23 insertions(+), 21 deletions(-) rename crates/{openshell-sandbox => openshell-core}/src/secrets.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index 6adefbac2..a12d66ca3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3457,6 +3457,7 @@ dependencies = [ name = "openshell-core" version = "0.0.0" dependencies = [ + "base64 0.22.1", "hex", "ipnet", "miette", diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index 76b8a253f..729aef3d3 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -23,6 +23,7 @@ url = { workspace = true } ipnet = "2" hex = "0.4" sha2 = { workspace = true } +base64 = { workspace = true } [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. 
diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 82dc5aadf..b98186ca4 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -26,6 +26,7 @@ pub mod procfs; pub mod progress; pub mod proto; pub mod sandbox_env; +pub mod secrets; pub mod settings; pub mod time; diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-core/src/secrets.rs similarity index 99% rename from crates/openshell-sandbox/src/secrets.rs rename to crates/openshell-core/src/secrets.rs index de7804393..42fefb944 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-core/src/secrets.rs @@ -117,13 +117,13 @@ impl fmt::Debug for SecretResolver { impl SecretResolver { #[cfg_attr(not(test), allow(dead_code))] - pub(crate) fn from_provider_env( + pub fn from_provider_env( provider_env: HashMap, ) -> (HashMap, Option) { Self::from_provider_env_for_revision(provider_env, HashMap::new(), 0) } - pub(crate) fn from_provider_env_for_revision( + pub fn from_provider_env_for_revision( provider_env: HashMap, credential_expires_at_ms: HashMap, revision: u64, @@ -136,7 +136,7 @@ impl SecretResolver { ) } - pub(crate) fn from_provider_env_for_current_revision( + pub fn from_provider_env_for_current_revision( provider_env: HashMap, credential_expires_at_ms: HashMap, revision: u64, @@ -201,7 +201,7 @@ impl SecretResolver { (child_env, Some(Self { by_placeholder })) } - pub(crate) fn merge<'a>(resolvers: impl IntoIterator) -> Option { + pub fn merge<'a>(resolvers: impl IntoIterator) -> Option { let mut by_placeholder = HashMap::new(); for resolver in resolvers { by_placeholder.extend(resolver.by_placeholder.clone()); @@ -217,7 +217,7 @@ impl SecretResolver { /// /// Returns `None` if the placeholder is unknown or the resolved value /// contains prohibited control characters (CRLF, null byte). 
- pub(crate) fn resolve_placeholder(&self, value: &str) -> Option<&str> { + pub fn resolve_placeholder(&self, value: &str) -> Option<&str> { let secret = if let Some(secret) = self.by_placeholder.get(value) { secret } else { @@ -245,7 +245,7 @@ impl SecretResolver { } } - pub(crate) fn rewrite_header_value( + pub fn rewrite_header_value( &self, value: &str, ) -> Result, UnresolvedPlaceholderError> { @@ -287,7 +287,7 @@ impl SecretResolver { Ok(None) } - pub(crate) fn rewrite_text_placeholders( + pub fn rewrite_text_placeholders( &self, text: &mut String, location: &'static str, @@ -352,7 +352,7 @@ impl SecretResolver { /// The message is mutated only after all placeholders resolve /// successfully. The return value is the number of replacements; callers /// must not log the rewritten text. - pub(crate) fn rewrite_websocket_text_placeholders( + pub fn rewrite_websocket_text_placeholders( &self, text: &mut String, ) -> Result { diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 6d271af21..90f1bfc7e 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -11,7 +11,7 @@ use crate::l7::provider::{L7Provider, RelayOutcome}; use crate::l7::rest::WebSocketExtensionMode; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; use crate::opa::{PolicyGenerationGuard, TunnelPolicyEngine}; -use crate::secrets::{self, SecretResolver}; +use openshell_core::secrets::{self, SecretResolver}; use miette::{IntoDiagnostic, Result, miette}; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index 20d52459c..3cf289dc5 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -9,7 +9,7 @@ use crate::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; use 
crate::opa::PolicyGenerationGuard; -use crate::secrets::{ +use openshell_core::secrets::{ SecretResolver, contains_reserved_credential_marker, rewrite_http_header_block, }; use base64::Engine as _; @@ -2001,7 +2001,7 @@ fn is_benign_close(err: &std::io::Error) -> bool { mod tests { use super::*; use crate::opa::OpaEngine; - use crate::secrets::SecretResolver; + use openshell_core::secrets::SecretResolver; use flate2::{Compress, Compression, Decompress, FlushCompress, FlushDecompress, Status}; use std::sync::Arc; diff --git a/crates/openshell-sandbox/src/l7/websocket.rs b/crates/openshell-sandbox/src/l7/websocket.rs index 2dc1b25c3..dea1ead35 100644 --- a/crates/openshell-sandbox/src/l7/websocket.rs +++ b/crates/openshell-sandbox/src/l7/websocket.rs @@ -9,7 +9,7 @@ use crate::l7::relay::{L7EvalContext, evaluate_l7_request}; use crate::l7::{EnforcementMode, L7RequestInfo}; use crate::opa::TunnelPolicyEngine; -use crate::secrets::SecretResolver; +use openshell_core::secrets::SecretResolver; use flate2::{Compress, Compression, Decompress, FlushCompress, FlushDecompress, Status}; use miette::{IntoDiagnostic, Result, miette}; use openshell_ocsf::{ @@ -1105,7 +1105,7 @@ mod tests { use super::*; use crate::l7::relay::L7EvalContext; use crate::opa::{NetworkInput, OpaEngine}; - use crate::secrets::SecretResolver; + use openshell_core::secrets::SecretResolver; use std::path::PathBuf; use tokio::io::{AsyncReadExt, AsyncWriteExt}; diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 8c13b25f0..65e36b85b 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -17,7 +17,6 @@ mod process; mod provider_credentials; pub mod proxy; mod sandbox; -mod secrets; mod ssh; mod supervisor_session; diff --git a/crates/openshell-sandbox/src/provider_credentials.rs b/crates/openshell-sandbox/src/provider_credentials.rs index ae91e8d6e..f82dfa702 100644 --- a/crates/openshell-sandbox/src/provider_credentials.rs +++ 
b/crates/openshell-sandbox/src/provider_credentials.rs @@ -3,7 +3,7 @@ //! Runtime provider credential snapshots. -use crate::secrets::SecretResolver; +use openshell_core::secrets::SecretResolver; use std::collections::{HashMap, VecDeque}; use std::sync::{Arc, RwLock}; diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 570311738..b4d82a252 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -9,9 +9,9 @@ use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; use crate::provider_credentials::ProviderCredentialState; -use crate::secrets::{SecretResolver, rewrite_header_line_checked}; use miette::{IntoDiagnostic, Result}; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; +use openshell_core::secrets::{SecretResolver, rewrite_header_line_checked}; use openshell_core::policy::ProxyPolicy; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, @@ -2663,14 +2663,14 @@ fn rewrite_forward_request( path: &str, secret_resolver: Option<&SecretResolver>, request_body_credential_rewrite: bool, -) -> Result, crate::secrets::UnresolvedPlaceholderError> { +) -> Result, openshell_core::secrets::UnresolvedPlaceholderError> { let header_end = raw[..used] .windows(4) .position(|w| w == b"\r\n\r\n") .map_or(used, |p| p + 4); let websocket_upgrade = crate::l7::rest::request_is_websocket_upgrade(&raw[..header_end]); let upstream_path = match secret_resolver { - Some(resolver) => crate::secrets::rewrite_target_for_eval(path, resolver)?.resolved, + Some(resolver) => openshell_core::secrets::rewrite_target_for_eval(path, resolver)?.resolved, None => path.to_string(), }; @@ -2763,10 +2763,10 @@ fn rewrite_forward_request( output.len() }; let output_str = String::from_utf8_lossy(&output[..scan_end]); - if 
output_str.contains(crate::secrets::PLACEHOLDER_PREFIX_PUBLIC) - || output_str.contains(crate::secrets::PROVIDER_ALIAS_MARKER_PUBLIC) + if output_str.contains(openshell_core::secrets::PLACEHOLDER_PREFIX_PUBLIC) + || output_str.contains(openshell_core::secrets::PROVIDER_ALIAS_MARKER_PUBLIC) { - return Err(crate::secrets::UnresolvedPlaceholderError { location: "header" }); + return Err(openshell_core::secrets::UnresolvedPlaceholderError { location: "header" }); } } From 960bf68baa894e59fe0216aa8ad6c72faedc7776 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 20:24:57 +0300 Subject: [PATCH 14/49] refactor(core): lift provider_credentials to openshell-core Move crates/openshell-sandbox/src/provider_credentials.rs to crates/openshell-core/src/provider_credentials.rs. Both supervisor leaves now name ProviderCredentialState in their function signatures (run_networking takes &ProviderCredentialState, run_process takes ProviderCredentialState by value), and under Shape A leaves can't depend on openshell-sandbox, so the type must live in openshell-core. The orchestrator (run_sandbox in openshell-sandbox) remains the only writer: it constructs ProviderCredentialState::from_environment and the policy poll loop calls install_environment on credential rotation. Both leaves stay pure readers via snapshot()/resolver(). Update import paths in proxy.rs, ssh.rs, and lib.rs from crate::provider_credentials to openshell_core::provider_credentials. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-core/src/lib.rs | 1 + .../src/provider_credentials.rs | 2 +- crates/openshell-sandbox/src/l7/relay.rs | 2 +- crates/openshell-sandbox/src/l7/rest.rs | 6 ++-- crates/openshell-sandbox/src/l7/websocket.rs | 2 +- crates/openshell-sandbox/src/lib.rs | 20 +++++------ crates/openshell-sandbox/src/proxy.rs | 36 +++++++++++-------- crates/openshell-sandbox/src/ssh.rs | 6 ++-- 8 files changed, 41 insertions(+), 34 deletions(-) rename crates/{openshell-sandbox => openshell-core}/src/provider_credentials.rs (99%) diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index b98186ca4..e34a1ab39 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -25,6 +25,7 @@ pub mod policy; pub mod procfs; pub mod progress; pub mod proto; +pub mod provider_credentials; pub mod sandbox_env; pub mod secrets; pub mod settings; diff --git a/crates/openshell-sandbox/src/provider_credentials.rs b/crates/openshell-core/src/provider_credentials.rs similarity index 99% rename from crates/openshell-sandbox/src/provider_credentials.rs rename to crates/openshell-core/src/provider_credentials.rs index f82dfa702..ae91e8d6e 100644 --- a/crates/openshell-sandbox/src/provider_credentials.rs +++ b/crates/openshell-core/src/provider_credentials.rs @@ -3,7 +3,7 @@ //! Runtime provider credential snapshots. 
-use openshell_core::secrets::SecretResolver; +use crate::secrets::SecretResolver; use std::collections::{HashMap, VecDeque}; use std::sync::{Arc, RwLock}; diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 90f1bfc7e..21e3133c2 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -11,8 +11,8 @@ use crate::l7::provider::{L7Provider, RelayOutcome}; use crate::l7::rest::WebSocketExtensionMode; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; use crate::opa::{PolicyGenerationGuard, TunnelPolicyEngine}; -use openshell_core::secrets::{self, SecretResolver}; use miette::{IntoDiagnostic, Result, miette}; +use openshell_core::secrets::{self, SecretResolver}; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, NetworkActivityBuilder, SeverityId, StatusId, Url as OcsfUrl, ocsf_emit, diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index 3cf289dc5..2216a378e 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -9,11 +9,11 @@ use crate::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; use crate::opa::PolicyGenerationGuard; +use base64::Engine as _; +use miette::{IntoDiagnostic, Result, miette}; use openshell_core::secrets::{ SecretResolver, contains_reserved_credential_marker, rewrite_http_header_block, }; -use base64::Engine as _; -use miette::{IntoDiagnostic, Result, miette}; use sha1::{Digest, Sha1}; use std::collections::{HashMap, HashSet}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; @@ -2001,8 +2001,8 @@ fn is_benign_close(err: &std::io::Error) -> bool { mod tests { use super::*; use crate::opa::OpaEngine; - use openshell_core::secrets::SecretResolver; use flate2::{Compress, Compression, Decompress, FlushCompress, FlushDecompress, Status}; + use 
openshell_core::secrets::SecretResolver; use std::sync::Arc; const TEST_POLICY: &str = include_str!("../../data/sandbox-policy.rego"); diff --git a/crates/openshell-sandbox/src/l7/websocket.rs b/crates/openshell-sandbox/src/l7/websocket.rs index dea1ead35..876d86d00 100644 --- a/crates/openshell-sandbox/src/l7/websocket.rs +++ b/crates/openshell-sandbox/src/l7/websocket.rs @@ -9,9 +9,9 @@ use crate::l7::relay::{L7EvalContext, evaluate_l7_request}; use crate::l7::{EnforcementMode, L7RequestInfo}; use crate::opa::TunnelPolicyEngine; -use openshell_core::secrets::SecretResolver; use flate2::{Compress, Compression, Decompress, FlushCompress, FlushDecompress, Status}; use miette::{IntoDiagnostic, Result, miette}; +use openshell_core::secrets::SecretResolver; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, NetworkActivityBuilder, SeverityId, StatusId, ocsf_emit, diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 65e36b85b..0da43fdd6 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -14,7 +14,6 @@ pub mod log_push; pub mod opa; mod policy_local; mod process; -mod provider_credentials; pub mod proxy; mod sandbox; mod ssh; @@ -98,18 +97,19 @@ pub(crate) use openshell_supervisor_process::proposals::{ #[cfg(test)] pub(crate) use openshell_supervisor_process::proposals::test_helpers; -use openshell_supervisor_networking::identity::BinaryIdentityCache; use crate::l7::tls::{ CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, write_ca_files, }; use crate::opa::OpaEngine; use crate::proxy::ProxyHandle; -use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; -use openshell_supervisor_process::skills; -use openshell_supervisor_networking::mechanistic_mapper; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; +use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, 
SandboxPolicy}; +use openshell_core::provider_credentials::ProviderCredentialState; +use openshell_supervisor_networking::identity::BinaryIdentityCache; +use openshell_supervisor_networking::mechanistic_mapper; +use openshell_supervisor_process::skills; pub use process::{ProcessHandle, ProcessStatus}; pub use sandbox::apply_supervisor_startup_hardening; @@ -283,7 +283,7 @@ async fn run_networking( #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, opa_engine: Option<&Arc>, entrypoint_pid: Arc, - provider_credentials: &provider_credentials::ProviderCredentialState, + provider_credentials: &ProviderCredentialState, policy_local_ctx: &Arc, sandbox_id: Option<&str>, openshell_endpoint: Option<&str>, @@ -501,7 +501,7 @@ async fn run_process( opa_engine: Option<&Arc>, retained_proto: Option<&openshell_core::proto::SandboxPolicy>, entrypoint_pid: Arc, - provider_credentials: provider_credentials::ProviderCredentialState, + provider_credentials: ProviderCredentialState, provider_env: std::collections::HashMap, policy_local_ctx: Arc, ocsf_enabled: Arc, @@ -987,7 +987,7 @@ pub async fn run_sandbox( ) }; - let provider_credentials = provider_credentials::ProviderCredentialState::from_environment( + let provider_credentials = ProviderCredentialState::from_environment( provider_env_revision, provider_env, provider_credential_expires_at_ms, @@ -2444,7 +2444,7 @@ struct PolicyPollLoopContext { entrypoint_pid: Arc, interval_secs: u64, ocsf_enabled: Arc, - provider_credentials: provider_credentials::ProviderCredentialState, + provider_credentials: ProviderCredentialState, policy_local_ctx: Option>, } @@ -2775,9 +2775,9 @@ fn format_setting_value(es: &openshell_core::proto::EffectiveSetting) -> String )] mod tests { use super::*; - use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[cfg(unix)] use nix::unistd::{Group, User}; + use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[cfg(unix)] use 
std::os::unix::fs::{MetadataExt, symlink}; use temp_env::with_vars; diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index b4d82a252..5ff2ab2eb 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -4,19 +4,19 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. use crate::denial_aggregator::DenialEvent; -use openshell_supervisor_networking::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; -use crate::provider_credentials::ProviderCredentialState; use miette::{IntoDiagnostic, Result}; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; -use openshell_core::secrets::{SecretResolver, rewrite_header_line_checked}; use openshell_core::policy::ProxyPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; +use openshell_core::secrets::{SecretResolver, rewrite_header_line_checked}; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, NetworkActivityBuilder, Process, SeverityId, StatusId, Url as OcsfUrl, ocsf_emit, }; +use openshell_supervisor_networking::identity::BinaryIdentityCache; use std::net::{IpAddr, SocketAddr}; use std::path::PathBuf; use std::sync::Arc; @@ -1178,13 +1178,14 @@ fn resolve_owner_identity( entrypoint_pid: u32, identity_cache: &BinaryIdentityCache, ) -> std::result::Result { - let bin_path = - openshell_core::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| IdentityError { + let bin_path = openshell_core::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| { + IdentityError { reason: format!("failed to resolve peer binary for PID {owner_pid}: {e}"), binary: None, binary_pid: Some(owner_pid), ancestors: vec![], - })?; + } + })?; let bin_hash = identity_cache 
.verify_or_cache(&bin_path) @@ -1213,7 +1214,8 @@ fn resolve_owner_identity( let mut exclude = ancestors.clone(); exclude.push(bin_path.clone()); - let cmdline_paths = openshell_core::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); + let cmdline_paths = + openshell_core::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); Ok(ResolvedIdentity { bin_path, @@ -1243,13 +1245,15 @@ fn resolve_process_identity( peer_port: u16, identity_cache: &BinaryIdentityCache, ) -> std::result::Result { - let socket_owners = openshell_core::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port) - .map_err(|e| IdentityError { - reason: format!("failed to resolve peer binary: {e}"), - binary: None, - binary_pid: None, - ancestors: vec![], - })?; + let socket_owners = + openshell_core::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port).map_err( + |e| IdentityError { + reason: format!("failed to resolve peer binary: {e}"), + binary: None, + binary_pid: None, + ancestors: vec![], + }, + )?; let mut identities = Vec::with_capacity(socket_owners.owners.len()); for owner in &socket_owners.owners { @@ -2670,7 +2674,9 @@ fn rewrite_forward_request( .map_or(used, |p| p + 4); let websocket_upgrade = crate::l7::rest::request_is_websocket_upgrade(&raw[..header_end]); let upstream_path = match secret_resolver { - Some(resolver) => openshell_core::secrets::rewrite_target_for_eval(path, resolver)?.resolved, + Some(resolver) => { + openshell_core::secrets::rewrite_target_for_eval(path, resolver)?.resolved + } None => path.to_string(), }; diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-sandbox/src/ssh.rs index 3a2a1142b..c350f4105 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-sandbox/src/ssh.rs @@ -4,18 +4,18 @@ //! Embedded SSH server for sandbox access. 
use crate::process::drop_privileges; -use crate::provider_credentials::ProviderCredentialState; use crate::sandbox; -use openshell_core::policy::SandboxPolicy; -use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; use nix::unistd::setsid; +use openshell_core::policy::SandboxPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, SeverityId, SshActivityBuilder, StatusId, ocsf_emit, }; +use openshell_supervisor_process::child_env; use rand_core::OsRng; use russh::keys::{Algorithm, PrivateKey}; use russh::server::{Auth, Handle, Session}; From b9dc830e0999b34d3278392b46ef7248ca131502 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 20:25:01 +0300 Subject: [PATCH 15/49] style: rustfmt import ordering Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/opa.rs | 4 +++- crates/openshell-sandbox/src/process.rs | 10 +++++----- crates/openshell-sandbox/src/sandbox/linux/landlock.rs | 2 +- crates/openshell-supervisor-networking/src/identity.rs | 2 +- crates/openshell-supervisor-networking/src/lib.rs | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-sandbox/src/opa.rs index 132d8869d..ecea499e0 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -8,7 +8,9 @@ //! on every proxy CONNECT request. 
use miette::Result; -use openshell_core::policy::{FilesystemPolicy, LandlockCompatibility, LandlockPolicy, ProcessPolicy}; +use openshell_core::policy::{ + FilesystemPolicy, LandlockCompatibility, LandlockPolicy, ProcessPolicy, +}; use openshell_core::proto::SandboxPolicy as ProtoSandboxPolicy; use std::path::{Path, PathBuf}; use std::sync::{ diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 69b6649d2..2e6f78fd2 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -6,13 +6,13 @@ use crate::sandbox; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; -use openshell_core::policy::{NetworkMode, SandboxPolicy}; -use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; use nix::sys::signal::{self, Signal}; use nix::unistd::{Group, Pid, User}; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; +use openshell_supervisor_process::child_env; use std::collections::HashMap; use std::ffi::CString; #[cfg(target_os = "linux")] @@ -658,13 +658,13 @@ impl From for ProcessStatus { #[cfg(test)] mod tests { use super::*; - use openshell_core::policy::{ - FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, - }; #[cfg(unix)] use nix::sys::wait::{WaitStatus, waitpid}; #[cfg(unix)] use nix::unistd::{ForkResult, fork}; + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, + }; #[cfg(unix)] use std::mem::size_of; use std::process::Stdio as StdStdio; diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index fc5e660d3..95f607c91 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -7,8 +7,8 @@ use landlock::{ 
ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, RulesetAttr, RulesetCreatedAttr, }; -use openshell_core::policy::{LandlockCompatibility, SandboxPolicy}; use miette::{IntoDiagnostic, Result}; +use openshell_core::policy::{LandlockCompatibility, SandboxPolicy}; use std::path::{Path, PathBuf}; use tracing::debug; diff --git a/crates/openshell-supervisor-networking/src/identity.rs b/crates/openshell-supervisor-networking/src/identity.rs index 73a62ff8d..7f5d467d8 100644 --- a/crates/openshell-supervisor-networking/src/identity.rs +++ b/crates/openshell-supervisor-networking/src/identity.rs @@ -8,8 +8,8 @@ //! path must match the cached hash. A mismatch indicates the binary was replaced //! mid-sandbox and the request is denied. -use openshell_core::procfs; use miette::Result; +use openshell_core::procfs; use std::collections::HashMap; use std::fs::Metadata; #[cfg(unix)] diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index 5b1f907e6..990e6caae 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -7,5 +7,5 @@ //! inference routing, TLS interception, and denial aggregation. Populated by //! follow-up commits as modules migrate out of `openshell-sandbox`. -pub mod mechanistic_mapper; pub mod identity; +pub mod mechanistic_mapper; From d1d40f73029a761ef70dc7b20f169239c2d42294 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 20:57:42 +0300 Subject: [PATCH 16/49] refactor(ocsf): move SandboxContext singleton from openshell-sandbox Move the process-wide OCSF SandboxContext OnceLock + LazyLock fallback + getter from openshell-sandbox/src/lib.rs into a new openshell-ocsf::ctx module. The type already lives in openshell-ocsf, so its singleton lives next to it. Add openshell_ocsf::ctx::set_ctx() and openshell_ocsf::ctx::ctx(). 
The orchestrator (run_sandbox) now calls set_ctx during startup. Sandbox keeps a pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; re-export so the 138 existing crate::ocsf_ctx() call sites resolve unchanged. When the sandbox modules themselves migrate into the leaf crates, they'll import openshell_ocsf::ctx directly and the re-export goes away. Under Shape A neither leaf can depend on openshell-sandbox; both already depend on openshell-ocsf to construct events, so this adds no new dep edge. Signed-off-by: Radoslav Hubenov --- crates/openshell-ocsf/src/ctx.rs | 41 +++++++++++++++++++++++ crates/openshell-ocsf/src/lib.rs | 1 + crates/openshell-sandbox/src/lib.rs | 51 +++++++++-------------------- 3 files changed, 57 insertions(+), 36 deletions(-) create mode 100644 crates/openshell-ocsf/src/ctx.rs diff --git a/crates/openshell-ocsf/src/ctx.rs b/crates/openshell-ocsf/src/ctx.rs new file mode 100644 index 000000000..6916c5521 --- /dev/null +++ b/crates/openshell-ocsf/src/ctx.rs @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide [`SandboxContext`] singleton. +//! +//! Initialised once via [`set_ctx`] during sandbox start; read by every event +//! builder via [`ctx`]. Falls back to a default context when the singleton has +//! not been set (e.g. unit tests that exercise builders without booting the +//! sandbox). + +use crate::SandboxContext; +use std::sync::{LazyLock, OnceLock}; + +static OCSF_CTX: OnceLock<SandboxContext> = OnceLock::new(); + +static OCSF_CTX_FALLBACK: LazyLock<SandboxContext> = LazyLock::new(|| SandboxContext { + sandbox_id: String::new(), + sandbox_name: String::new(), + container_image: String::new(), + hostname: "test".to_string(), + product_version: env!("CARGO_PKG_VERSION").to_string(), + proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), + proxy_port: 3128, +}); + +/// Initialise the process-wide OCSF sandbox context.
+/// +/// Returns `false` if the context was already set; the caller may log and +/// continue. Intended to be called exactly once during sandbox startup. +pub fn set_ctx(ctx: SandboxContext) -> bool { + OCSF_CTX.set(ctx).is_ok() +} + +/// Return a reference to the process-wide [`SandboxContext`]. +/// +/// Falls back to a default context if [`set_ctx`] has not been called (e.g. +/// during unit tests that exercise individual builders). +#[must_use] +pub fn ctx() -> &'static SandboxContext { + OCSF_CTX.get().unwrap_or(&OCSF_CTX_FALLBACK) +} diff --git a/crates/openshell-ocsf/src/lib.rs b/crates/openshell-ocsf/src/lib.rs index b9000afcf..e9d1402a6 100644 --- a/crates/openshell-ocsf/src/lib.rs +++ b/crates/openshell-ocsf/src/lib.rs @@ -25,6 +25,7 @@ pub const OCSF_VERSION: &str = "1.7.0"; pub mod builders; +pub mod ctx; pub mod enums; pub mod events; pub mod format; diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 0da43fdd6..33e447e7d 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -25,10 +25,8 @@ use std::collections::HashSet; use std::future::Future; use std::net::SocketAddr; use std::sync::Arc; -use std::sync::LazyLock; #[cfg(any(target_os = "linux", test))] use std::sync::Mutex; -use std::sync::OnceLock; use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; use tokio::time::timeout; @@ -59,29 +57,12 @@ use openshell_ocsf::{ // policy changes, or observable sandbox behavior worth structuring. // --------------------------------------------------------------------------- -/// Process-wide OCSF sandbox context. Initialized once during `run_sandbox()` -/// startup and accessible from any module in the crate via [`ocsf_ctx()`]. -static OCSF_CTX: OnceLock<SandboxContext> = OnceLock::new(); - -/// Fallback context used when `OCSF_CTX` has not been initialized (e.g. in -/// unit tests that exercise individual functions without calling `run_sandbox`).
-static OCSF_CTX_FALLBACK: LazyLock<SandboxContext> = LazyLock::new(|| SandboxContext { - sandbox_id: String::new(), - sandbox_name: String::new(), - container_image: String::new(), - hostname: "test".to_string(), - product_version: openshell_core::VERSION.to_string(), - proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), - proxy_port: 3128, -}); - -/// Return a reference to the process-wide [`SandboxContext`]. +/// Re-export the process-wide OCSF sandbox context getter. /// -/// Falls back to a default context if `run_sandbox()` has not yet been called -/// (e.g. during unit tests). -pub(crate) fn ocsf_ctx() -> &'static SandboxContext { - OCSF_CTX.get().unwrap_or(&OCSF_CTX_FALLBACK) -} +/// The singleton lives in `openshell-ocsf` so both supervisor leaves can +/// reach it without depending on `openshell-sandbox`. Initialised once during +/// `run_sandbox()` startup via `openshell_ocsf::ctx::set_ctx`. +pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; /// Process-wide flag for the agent-driven policy proposal surface.
/// Set once during `run_sandbox()` startup and updated by the settings poll @@ -899,18 +880,15 @@ pub async fn run_sandbox( |s| s.trim().to_string(), ); - if OCSF_CTX - .set(SandboxContext { - sandbox_id: sandbox_id.clone().unwrap_or_default(), - sandbox_name: sandbox.as_deref().unwrap_or_default().to_string(), - container_image: std::env::var("OPENSHELL_CONTAINER_IMAGE").unwrap_or_default(), - hostname, - product_version: openshell_core::VERSION.to_string(), - proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), - proxy_port: 3128, - }) - .is_err() - { + if !openshell_ocsf::ctx::set_ctx(SandboxContext { + sandbox_id: sandbox_id.clone().unwrap_or_default(), + sandbox_name: sandbox.as_deref().unwrap_or_default().to_string(), + container_image: std::env::var("OPENSHELL_CONTAINER_IMAGE").unwrap_or_default(), + hostname, + product_version: openshell_core::VERSION.to_string(), + proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), + proxy_port: 3128, + }) { debug!("OCSF context already initialized, keeping existing"); } } @@ -2780,6 +2758,7 @@ mod tests { use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[cfg(unix)] use std::os::unix::fs::{MetadataExt, symlink}; + use std::sync::LazyLock; use temp_env::with_vars; static ENV_LOCK: LazyLock<Mutex<()>> = LazyLock::new(|| Mutex::new(())); From 57a97bb346a88ce24fea56bdb6c2e4df3a2c9bcc Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 21:29:05 +0300 Subject: [PATCH 17/49] refactor(core): lift grpc_client to openshell-core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both prospective leaves (supervisor-networking and supervisor-process) need CachedOpenShellClient, AuthedChannel, and the connect/fetch helpers. Under Shape A the leaves cannot depend on openshell-sandbox, so the type has to live below them. openshell-core already pulls in tonic and miette; this enables tonic's channel/tls features and adds tokio as a direct dep.
Updates all crate::grpc_client::* call sites in openshell-sandbox to openshell_core::grpc_client::*. No re-export shim — the call-site count was small enough to update directly. See architecture/plans/sandbox-split-design-choices.md for the full rationale and trade-offs. Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-core/Cargo.toml | 3 +- .../src/grpc_client.rs | 8 ++-- crates/openshell-core/src/lib.rs | 1 + crates/openshell-sandbox/src/debug_rpc.rs | 2 +- crates/openshell-sandbox/src/lib.rs | 38 +++++++++++++------ crates/openshell-sandbox/src/log_push.rs | 2 +- crates/openshell-sandbox/src/policy_local.rs | 6 +-- .../src/supervisor_session.rs | 2 +- 9 files changed, 40 insertions(+), 23 deletions(-) rename crates/{openshell-sandbox => openshell-core}/src/grpc_client.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index a12d66ca3..e13c91bf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3469,6 +3469,7 @@ dependencies = [ "sha2 0.10.9", "tempfile", "thiserror 2.0.18", + "tokio", "tonic", "tonic-build", "tracing", diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index 729aef3d3..4f4acf0b3 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -13,7 +13,8 @@ repository.workspace = true [dependencies] prost = { workspace = true } prost-types = { workspace = true } -tonic = { workspace = true } +tonic = { workspace = true, features = ["channel", "tls"] } +tokio = { workspace = true } thiserror = { workspace = true } miette = { workspace = true } serde = { workspace = true } diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-core/src/grpc_client.rs similarity index 99% rename from crates/openshell-sandbox/src/grpc_client.rs rename to crates/openshell-core/src/grpc_client.rs index 14a6808c1..b68137833 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-core/src/grpc_client.rs @@ -22,15 +22,15 @@ use std::collections::HashMap; use 
std::sync::{Arc, OnceLock, RwLock}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use miette::{IntoDiagnostic, Result, WrapErr}; -use openshell_core::proto::{ +use crate::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; -use openshell_core::sandbox_env; +use crate::sandbox_env; +use miette::{IntoDiagnostic, Result, WrapErr}; use tonic::Status; use tonic::metadata::AsciiMetadataValue; use tonic::service::interceptor::InterceptedService; @@ -674,7 +674,7 @@ pub struct SettingsPollResult { pub config_revision: u64, pub policy_source: PolicySource, /// Effective settings keyed by name. - pub settings: HashMap, + pub settings: HashMap, /// When `policy_source` is `Global`, the version of the global policy revision. 
pub global_policy_version: u32, pub provider_env_revision: u64, diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index e34a1ab39..19b15e82c 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -16,6 +16,7 @@ pub mod driver_utils; pub mod error; pub mod forward; pub mod gpu; +pub mod grpc_client; pub mod image; pub mod inference; pub mod metadata; diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs index af22b7450..f583d54dc 100644 --- a/crates/openshell-sandbox/src/debug_rpc.rs +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -24,7 +24,7 @@ use openshell_core::proto::{ }; use sha2::{Digest, Sha256}; -use crate::grpc_client::{AuthedChannel, connect_channel_pub}; +use openshell_core::grpc_client::{AuthedChannel, connect_channel_pub}; /// Entry point for the `debug-rpc` subcommand. Returns the process exit /// code; `main` propagates it. diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 33e447e7d..275cb4b36 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -8,7 +8,6 @@ pub mod bypass_monitor; pub mod debug_rpc; pub mod denial_aggregator; -mod grpc_client; pub mod l7; pub mod log_push; pub mod opa; @@ -920,7 +919,7 @@ pub async fn run_sandbox( // even if provider env fetch fails (graceful degradation). let (provider_env_revision, provider_env, provider_credential_expires_at_ms) = if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) { - match grpc_client::fetch_provider_environment(endpoint, id).await { + match openshell_core::grpc_client::fetch_provider_environment(endpoint, id).await { Ok(result) => { ocsf_emit!( ConfigStateChangeBuilder::new(ocsf_ctx()) @@ -1001,7 +1000,8 @@ pub async fn run_sandbox( // at startup rather than waiting for the poll loop's first tick. In // offline/file-mode there is no gateway, so the flag stays false. 
if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) - && let Ok(client) = grpc_client::CachedOpenShellClient::connect(endpoint).await + && let Ok(client) = + openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint).await && let Ok(result) = client.poll_settings(id).await { let initial = extract_bool_setting( @@ -1156,7 +1156,7 @@ async fn build_inference_context( // Cluster mode: fetch bundle from gateway info!(endpoint = %endpoint, "Fetching inference route bundle from gateway"); - match grpc_client::fetch_inference_bundle(endpoint).await { + match openshell_core::grpc_client::fetch_inference_bundle(endpoint).await { Ok(bundle) => { initial_revision = Some(bundle.revision.clone()); ocsf_emit!( @@ -1357,7 +1357,7 @@ pub(crate) fn spawn_route_refresh( loop { tick.tick().await; - match grpc_client::fetch_inference_bundle(&endpoint).await { + match openshell_core::grpc_client::fetch_inference_bundle(&endpoint).await { Ok(bundle) => { if current_revision.as_deref() == Some(&bundle.revision) { trace!(revision = %bundle.revision, "Inference bundle unchanged"); @@ -2026,8 +2026,10 @@ async fn load_policy( endpoint = %endpoint, "Fetching sandbox policy via gRPC" ); - let proto_policy = - grpc_retry("Policy fetch", || grpc_client::fetch_policy(endpoint, id)).await?; + let proto_policy = grpc_retry("Policy fetch", || { + openshell_core::grpc_client::fetch_policy(endpoint, id) + }) + .await?; let mut proto_policy = if let Some(p) = proto_policy { p @@ -2057,7 +2059,12 @@ async fn load_policy( // Sync and re-fetch over a single connection to avoid extra // TLS handshakes. grpc_retry("Policy discovery sync", || { - grpc_client::discover_and_sync_policy(endpoint, id, sandbox, &discovered) + openshell_core::grpc_client::discover_and_sync_policy( + endpoint, + id, + sandbox, + &discovered, + ) }) .await? 
}; @@ -2068,7 +2075,9 @@ async fn load_policy( let enriched = enrich_proto_baseline_paths(&mut proto_policy); if enriched && let Some(sandbox_name) = sandbox.as_deref() - && let Err(e) = grpc_client::sync_policy(endpoint, sandbox_name, &proto_policy).await + && let Err(e) = + openshell_core::grpc_client::sync_policy(endpoint, sandbox_name, &proto_policy) + .await { warn!( error = %e, @@ -2353,7 +2362,7 @@ async fn flush_proposals_to_gateway( sandbox_name: &str, summaries: Vec, ) -> Result<()> { - use crate::grpc_client::CachedOpenShellClient; + use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::{DenialSummary, L7RequestSample}; let client = CachedOpenShellClient::connect(endpoint).await?; @@ -2427,7 +2436,7 @@ struct PolicyPollLoopContext { } async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { - use crate::grpc_client::CachedOpenShellClient; + use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::PolicySource; use std::sync::atomic::Ordering; @@ -2493,7 +2502,12 @@ async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { .build()); if provider_env_changed { - match grpc_client::fetch_provider_environment(&ctx.endpoint, &ctx.sandbox_id).await { + match openshell_core::grpc_client::fetch_provider_environment( + &ctx.endpoint, + &ctx.sandbox_id, + ) + .await + { Ok(env_result) => { let env_count = ctx.provider_credentials.install_environment( env_result.provider_env_revision, diff --git a/crates/openshell-sandbox/src/log_push.rs b/crates/openshell-sandbox/src/log_push.rs index fd33d1e07..f65f30433 100644 --- a/crates/openshell-sandbox/src/log_push.rs +++ b/crates/openshell-sandbox/src/log_push.rs @@ -7,7 +7,7 @@ //! channel to a background task. The task batches lines and streams them to //! the server using the `PushSandboxLogs` client-streaming RPC. 
-use crate::grpc_client::CachedOpenShellClient; +use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::{PushSandboxLogsRequest, SandboxLogLine}; use tokio::sync::mpsc; use tracing::{Event, Subscriber}; diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs index fcf6e1f8e..9d570dfd0 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-sandbox/src/policy_local.rs @@ -484,7 +484,7 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j Err(error) => return (400, error_payload("invalid_proposal", error)), }; - let client = match crate::grpc_client::CachedOpenShellClient::connect(endpoint).await { + let client = match openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint).await { Ok(client) => client, Err(error) => { return ( @@ -877,7 +877,7 @@ fn parse_timeout_query(query: &str) -> u64 { /// per request and reused for every `fetch_chunk` call in a wait loop so a /// 60-second wait does one TLS handshake, not sixty. 
struct LookupSession<'a> { - client: crate::grpc_client::CachedOpenShellClient, + client: openshell_core::grpc_client::CachedOpenShellClient, sandbox_name: &'a str, } @@ -909,7 +909,7 @@ async fn open_lookup_session( ), ) })?; - let client = crate::grpc_client::CachedOpenShellClient::connect(endpoint) + let client = openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint) .await .map_err(|e| (502, error_payload("gateway_connect_failed", e.to_string())))?; Ok(LookupSession { diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-sandbox/src/supervisor_session.rs index 4d7392ee3..0310bb445 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-sandbox/src/supervisor_session.rs @@ -30,7 +30,7 @@ use tokio::sync::mpsc; use tokio_stream::StreamExt; use tracing::{debug, warn}; -use crate::grpc_client; +use openshell_core::grpc_client; const INITIAL_BACKOFF: Duration = Duration::from_secs(1); const MAX_BACKOFF: Duration = Duration::from_secs(30); From df881dfa1b5e46c6b4eb4b6cea00b404d400ea09 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 21:42:56 +0300 Subject: [PATCH 18/49] refactor(supervisor-networking): move denial_aggregator from openshell-sandbox DenialAggregator and FlushableDenialSummary belong with the proxy and L7 layer that emit denials. Moves the file into openshell-supervisor-networking; adds tokio as a regular dep there since DenialAggregator uses tokio::sync::mpsc. Drops the pub use openshell_core::DenialEvent re-export inside the moved file (no longer needed cross-crate). Updates bypass_monitor.rs, proxy.rs, and lib.rs to import openshell_core::DenialEvent directly. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-sandbox/src/bypass_monitor.rs | 2 +- crates/openshell-sandbox/src/lib.rs | 13 ++++++++----- crates/openshell-sandbox/src/proxy.rs | 2 +- crates/openshell-supervisor-networking/Cargo.toml | 1 + .../src/denial_aggregator.rs | 2 +- crates/openshell-supervisor-networking/src/lib.rs | 1 + 7 files changed, 14 insertions(+), 8 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/denial_aggregator.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index e13c91bf3..24c1c4422 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3777,6 +3777,7 @@ dependencies = [ "miette", "openshell-core", "tempfile", + "tokio", "tracing", ] diff --git a/crates/openshell-sandbox/src/bypass_monitor.rs b/crates/openshell-sandbox/src/bypass_monitor.rs index 3b71cd978..50504c068 100644 --- a/crates/openshell-sandbox/src/bypass_monitor.rs +++ b/crates/openshell-sandbox/src/bypass_monitor.rs @@ -16,7 +16,7 @@ //! the monitor logs a one-time warning and returns. The nftables reject rules //! still provide fast-fail UX — the monitor only adds diagnostic visibility. 
-use crate::denial_aggregator::DenialEvent; +use openshell_core::DenialEvent; use openshell_ocsf::{ ActionId, ActivityId, ConfidenceId, DetectionFindingBuilder, DispositionId, Endpoint, FindingInfo, NetworkActivityBuilder, Process, SeverityId, ocsf_emit, diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 275cb4b36..26c639140 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,7 +7,6 @@ pub mod bypass_monitor; pub mod debug_rpc; -pub mod denial_aggregator; pub mod l7; pub mod log_push; pub mod opa; @@ -248,7 +247,7 @@ struct Networking { ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, ssh_proxy_url: Option, ssh_netns_fd: Option, - denial_rx: Option>, + denial_rx: Option>, } /// Set up the networking stack: ephemeral CA + TLS state, proxy server, @@ -489,7 +488,7 @@ async fn run_process( ssh_netns_fd: Option, ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, - denial_rx: Option>, + denial_rx: Option>, ) -> Result { // Zombie reaper — openshell-sandbox may run as PID 1 in containers and // must reap orphaned grandchildren (e.g. 
background daemons started by @@ -783,7 +782,11 @@ async fn run_process( .and_then(|v| v.parse().ok()) .unwrap_or(10); - let aggregator = denial_aggregator::DenialAggregator::new(rx, flush_interval_secs); + let aggregator = + openshell_supervisor_networking::denial_aggregator::DenialAggregator::new( + rx, + flush_interval_secs, + ); tokio::spawn(async move { aggregator @@ -2360,7 +2363,7 @@ fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { async fn flush_proposals_to_gateway( endpoint: &str, sandbox_name: &str, - summaries: Vec, + summaries: Vec, ) -> Result<()> { use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::{DenialSummary, L7RequestSample}; diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 5ff2ab2eb..15a549874 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -3,11 +3,11 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. 
-use crate::denial_aggregator::DenialEvent; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; use miette::{IntoDiagnostic, Result}; +use openshell_core::DenialEvent; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; use openshell_core::policy::ProxyPolicy; use openshell_core::provider_credentials::ProviderCredentialState; diff --git a/crates/openshell-supervisor-networking/Cargo.toml b/crates/openshell-supervisor-networking/Cargo.toml index 10011766e..5b465d6f1 100644 --- a/crates/openshell-supervisor-networking/Cargo.toml +++ b/crates/openshell-supervisor-networking/Cargo.toml @@ -14,6 +14,7 @@ rust-version.workspace = true openshell-core = { path = "../openshell-core" } miette = { workspace = true } +tokio = { workspace = true } tracing = { workspace = true } [dev-dependencies] diff --git a/crates/openshell-sandbox/src/denial_aggregator.rs b/crates/openshell-supervisor-networking/src/denial_aggregator.rs similarity index 99% rename from crates/openshell-sandbox/src/denial_aggregator.rs rename to crates/openshell-supervisor-networking/src/denial_aggregator.rs index 648095a23..c954ede30 100644 --- a/crates/openshell-sandbox/src/denial_aggregator.rs +++ b/crates/openshell-supervisor-networking/src/denial_aggregator.rs @@ -14,7 +14,7 @@ use std::future::Future; use tokio::sync::mpsc; use tracing::debug; -pub use openshell_core::DenialEvent; +use openshell_core::DenialEvent; /// Aggregated denial summary keyed by `(host, port, binary)`. #[derive(Debug, Clone)] diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index 990e6caae..2c9d649a8 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -7,5 +7,6 @@ //! inference routing, TLS interception, and denial aggregation. Populated by //! 
follow-up commits as modules migrate out of `openshell-sandbox`. +pub mod denial_aggregator; pub mod identity; pub mod mechanistic_mapper; From 3b70ad8a8a96fcf191aa5a81eefb08785cc5da27 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 21:49:24 +0300 Subject: [PATCH 19/49] refactor(supervisor-process): move log_push from openshell-sandbox LogPushLayer is a process-side tracing layer that streams sandbox logs to the gateway via gRPC. Moves into openshell-supervisor-process; adds openshell-core, openshell-ocsf, tokio-stream, tracing, and tracing-subscriber as direct deps there. Updates the only external call site (openshell-sandbox/src/main.rs) to import from openshell_supervisor_process::log_push. Signed-off-by: Radoslav Hubenov --- Cargo.lock | 5 +++++ crates/openshell-sandbox/src/lib.rs | 1 - crates/openshell-sandbox/src/main.rs | 5 +++-- crates/openshell-supervisor-process/Cargo.toml | 6 ++++++ crates/openshell-supervisor-process/src/lib.rs | 1 + .../src/log_push.rs | 0 6 files changed, 15 insertions(+), 3 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/log_push.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 24c1c4422..a33824ea9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3786,8 +3786,13 @@ name = "openshell-supervisor-process" version = "0.0.0" dependencies = [ "miette", + "openshell-core", + "openshell-ocsf", "tempfile", "tokio", + "tokio-stream", + "tracing", + "tracing-subscriber", ] [[package]] diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 26c639140..78865d89a 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -8,7 +8,6 @@ pub mod bypass_monitor; pub mod debug_rpc; pub mod l7; -pub mod log_push; pub mod opa; mod policy_local; mod process; diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 3c9e21578..3c8a35456 100644 --- a/crates/openshell-sandbox/src/main.rs +++ 
b/crates/openshell-sandbox/src/main.rs @@ -206,11 +206,12 @@ fn main() -> Result<()> { let log_push_state = if let (Some(sandbox_id), Some(endpoint)) = (&args.sandbox_id, &args.openshell_endpoint) { - let (tx, handle) = openshell_sandbox::log_push::spawn_log_push_task( + let (tx, handle) = openshell_supervisor_process::log_push::spawn_log_push_task( endpoint.clone(), sandbox_id.clone(), ); - let layer = openshell_sandbox::log_push::LogPushLayer::new(sandbox_id.clone(), tx); + let layer = + openshell_supervisor_process::log_push::LogPushLayer::new(sandbox_id.clone(), tx); Some((layer, handle)) } else { None diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index cab945aa1..38f7c9d79 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -11,8 +11,14 @@ repository.workspace = true rust-version.workspace = true [dependencies] +openshell-core = { path = "../openshell-core" } +openshell-ocsf = { path = "../openshell-ocsf" } + miette = { workspace = true } tokio = { workspace = true } +tokio-stream = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } [features] ## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index b8b19caae..df066af8a 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -9,5 +9,6 @@ //! `openshell-sandbox`. 
pub mod child_env; +pub mod log_push; pub mod proposals; pub mod skills; diff --git a/crates/openshell-sandbox/src/log_push.rs b/crates/openshell-supervisor-process/src/log_push.rs similarity index 100% rename from crates/openshell-sandbox/src/log_push.rs rename to crates/openshell-supervisor-process/src/log_push.rs From 159efcb4e3cd04bc5b92f19d6325dd3b75e69f3a Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 21:52:43 +0300 Subject: [PATCH 20/49] refactor(supervisor-process): move bypass_monitor from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bypass_monitor reads /dev/kmsg for nftables drop log lines and emits denial events. Pure process-side concern, called only from run_networking which spawns it on the netns. Moves into openshell-supervisor-process; all deps (openshell-core, openshell-ocsf, tokio, tracing) were already declared there. Replaces crate::ocsf_ctx() shim calls inside the moved file with openshell_ocsf::ctx::ctx() — first leaf-side caller to import the OCSF context singleton directly instead of going through openshell-sandbox's re-export. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 3 +-- .../src/bypass_monitor.rs | 10 +++++----- crates/openshell-supervisor-process/src/lib.rs | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/bypass_monitor.rs (98%) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 78865d89a..6de227a22 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -5,7 +5,6 @@ //! //! This crate provides process sandboxing and monitoring capabilities. -pub mod bypass_monitor; pub mod debug_rpc; pub mod l7; pub mod opa; @@ -396,7 +395,7 @@ async fn run_networking( // tracing events for direct connection attempts that bypass the proxy. 
#[cfg(target_os = "linux")] let bypass_monitor_handle = netns.and_then(|ns| { - bypass_monitor::spawn( + openshell_supervisor_process::bypass_monitor::spawn( ns.name().to_string(), entrypoint_pid.clone(), bypass_denial_tx, diff --git a/crates/openshell-sandbox/src/bypass_monitor.rs b/crates/openshell-supervisor-process/src/bypass_monitor.rs similarity index 98% rename from crates/openshell-sandbox/src/bypass_monitor.rs rename to crates/openshell-supervisor-process/src/bypass_monitor.rs index 50504c068..aadafda9c 100644 --- a/crates/openshell-sandbox/src/bypass_monitor.rs +++ b/crates/openshell-supervisor-process/src/bypass_monitor.rs @@ -130,7 +130,7 @@ pub fn spawn( .status(); if !dmesg_check.is_ok_and(|s| s.success()) { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .message( @@ -158,7 +158,7 @@ pub fn spawn( { Ok(c) => c, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .message(format!( @@ -171,7 +171,7 @@ pub fn spawn( }; let Some(stdout) = child.stdout.take() else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .message("dmesg --follow produced no stdout; bypass monitor will not run") @@ -214,7 +214,7 @@ pub fn spawn( Endpoint::from_domain(&event.dst_addr, event.dst_port) }; - let net_event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let net_event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -230,7 +230,7 @@ pub fn spawn( .build(); ocsf_emit!(net_event); - let finding_event = DetectionFindingBuilder::new(crate::ocsf_ctx()) 
+ let finding_event = DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index df066af8a..4af907815 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -8,6 +8,7 @@ //! and log push. Populated by follow-up commits as modules migrate out of //! `openshell-sandbox`. +pub mod bypass_monitor; pub mod child_env; pub mod log_push; pub mod proposals; From 1dfb8e88858152a07776058951e9b063987bdbcc Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 21:57:10 +0300 Subject: [PATCH 21/49] refactor(supervisor-process): move debug_rpc from openshell-sandbox debug_rpc is the CLI subcommand handler that exercises authenticated gRPC calls (issue-token, refresh-token, get-config, etc.). Pure process-side concern, called only from openshell-sandbox/main.rs. Adds base64, hex, serde_json, sha2, and tonic (with channel/tls features) as direct deps on openshell-supervisor-process. Updates the single call site in main.rs. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 5 +++++ crates/openshell-sandbox/src/lib.rs | 1 - crates/openshell-sandbox/src/main.rs | 2 +- crates/openshell-supervisor-process/Cargo.toml | 5 +++++ .../src/debug_rpc.rs | 0 crates/openshell-supervisor-process/src/lib.rs | 1 + 6 files changed, 12 insertions(+), 2 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/debug_rpc.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index a33824ea9..efabc94f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3785,12 +3785,17 @@ dependencies = [ name = "openshell-supervisor-process" version = "0.0.0" dependencies = [ + "base64 0.22.1", + "hex", "miette", "openshell-core", "openshell-ocsf", + "serde_json", + "sha2 0.10.9", "tempfile", "tokio", "tokio-stream", + "tonic", "tracing", "tracing-subscriber", ] diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 6de227a22..785e4a688 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -5,7 +5,6 @@ //! //! This crate provides process sandboxing and monitoring capabilities. 
-pub mod debug_rpc; pub mod l7; pub mod opa; mod policy_local; diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 3c8a35456..6e82acdba 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -168,7 +168,7 @@ fn main() -> Result<()> { .into_diagnostic()?; return runtime.block_on(async move { let _ = rustls::crypto::ring::default_provider().install_default(); - let exit = openshell_sandbox::debug_rpc::run(&raw_args[2..]).await?; + let exit = openshell_supervisor_process::debug_rpc::run(&raw_args[2..]).await?; std::process::exit(exit); }); } diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index 38f7c9d79..7cf2bc596 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -14,9 +14,14 @@ rust-version.workspace = true openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } +base64 = { workspace = true } +hex = "0.4" miette = { workspace = true } +serde_json = { workspace = true } +sha2 = { workspace = true } tokio = { workspace = true } tokio-stream = { workspace = true } +tonic = { workspace = true, features = ["channel", "tls"] } tracing = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-supervisor-process/src/debug_rpc.rs similarity index 100% rename from crates/openshell-sandbox/src/debug_rpc.rs rename to crates/openshell-supervisor-process/src/debug_rpc.rs diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 4af907815..972606e4a 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -10,6 +10,7 @@ pub mod bypass_monitor; pub mod child_env; +pub mod debug_rpc; pub mod log_push; pub mod proposals; pub mod skills; From 
db86d516bf464496cecb336f3f51f253e51d40a6 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 22:01:17 +0300 Subject: [PATCH 22/49] refactor(supervisor-process): move supervisor_session from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit supervisor_session opens a bidirectional gRPC stream that lets the gateway initiate shells inside the sandbox. Pure process-side concern, called only from run_process. Adds uuid as a direct dep on openshell-supervisor-process. Replaces crate::ocsf_ctx() shim calls inside the moved file with openshell_ocsf::ctx::ctx() — same pattern as bypass_monitor. Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-sandbox/src/lib.rs | 3 +- .../openshell-supervisor-process/Cargo.toml | 1 + .../openshell-supervisor-process/src/lib.rs | 1 + .../src/supervisor_session.rs | 31 +++++++++++++------ 5 files changed, 25 insertions(+), 12 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/supervisor_session.rs (97%) diff --git a/Cargo.lock b/Cargo.lock index efabc94f2..eb9ec5401 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3798,6 +3798,7 @@ dependencies = [ "tonic", "tracing", "tracing-subscriber", + "uuid", ] [[package]] diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 785e4a688..2484d24a3 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -12,7 +12,6 @@ mod process; pub mod proxy; mod sandbox; mod ssh; -mod supervisor_session; use miette::{IntoDiagnostic, Result}; #[cfg(target_os = "linux")] @@ -625,7 +624,7 @@ async fn run_process( if let (Some(endpoint), Some(id), Some(socket)) = (openshell_endpoint, sandbox_id, ssh_socket_path.as_ref()) { - supervisor_session::spawn( + openshell_supervisor_process::supervisor_session::spawn( endpoint.to_string(), id.to_string(), socket.clone(), diff --git a/crates/openshell-supervisor-process/Cargo.toml 
b/crates/openshell-supervisor-process/Cargo.toml index 7cf2bc596..ede8108b9 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -24,6 +24,7 @@ tokio-stream = { workspace = true } tonic = { workspace = true, features = ["channel", "tls"] } tracing = { workspace = true } tracing-subscriber = { workspace = true } +uuid = { workspace = true } [features] ## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 972606e4a..d7b61c565 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -14,3 +14,4 @@ pub mod debug_rpc; pub mod log_push; pub mod proposals; pub mod skills; +pub mod supervisor_session; diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-supervisor-process/src/supervisor_session.rs similarity index 97% rename from crates/openshell-sandbox/src/supervisor_session.rs rename to crates/openshell-supervisor-process/src/supervisor_session.rs index 0310bb445..7c524676c 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-supervisor-process/src/supervisor_session.rs @@ -258,13 +258,18 @@ async fn run_session_loop( match run_single_session(&endpoint, &sandbox_id, &ssh_socket_path, netns_fd).await { Ok(()) => { - let event = session_closed_event(crate::ocsf_ctx(), &endpoint, &sandbox_id); + let event = + session_closed_event(openshell_ocsf::ctx::ctx(), &endpoint, &sandbox_id); ocsf_emit!(event); break; } Err(e) => { - let event = - session_failed_event(crate::ocsf_ctx(), &endpoint, attempt, &e.to_string()); + let event = session_failed_event( + openshell_ocsf::ctx::ctx(), + &endpoint, + attempt, + &e.to_string(), + ); ocsf_emit!(event); tokio::time::sleep(backoff).await; backoff = (backoff * 2).min(MAX_BACKOFF); @@ -326,7 +331,7 @@ async fn 
run_single_session( let heartbeat_secs = accepted.heartbeat_interval_secs.max(5); let event = session_established_event( - crate::ocsf_ctx(), + openshell_ocsf::ctx::ctx(), endpoint, &accepted.session_id, heartbeat_secs, @@ -385,20 +390,23 @@ fn handle_gateway_message( let ssh_socket_path = ssh_socket_path.to_path_buf(); let tx = tx.clone(); - let event = relay_open_event(crate::ocsf_ctx(), &relay_open, &ssh_socket_path); + let event = relay_open_event(openshell_ocsf::ctx::ctx(), &relay_open, &ssh_socket_path); ocsf_emit!(event); tokio::spawn(async move { let event_open = relay_open.clone(); match handle_relay_open(relay_open, &ssh_socket_path, netns_fd, channel, tx).await { Ok(()) => { - let event = - relay_closed_event(crate::ocsf_ctx(), &event_open, &ssh_socket_path); + let event = relay_closed_event( + openshell_ocsf::ctx::ctx(), + &event_open, + &ssh_socket_path, + ); ocsf_emit!(event); } Err(e) => { let event = relay_failed_event( - crate::ocsf_ctx(), + openshell_ocsf::ctx::ctx(), &event_open, &ssh_socket_path, &e.to_string(), @@ -415,8 +423,11 @@ fn handle_gateway_message( }); } Some(gateway_message::Payload::RelayClose(close)) => { - let event = - relay_close_from_gateway_event(crate::ocsf_ctx(), &close.channel_id, &close.reason); + let event = relay_close_from_gateway_event( + openshell_ocsf::ctx::ctx(), + &close.channel_id, + &close.reason, + ); ocsf_emit!(event); } _ => { From 8114e8dcbc177dc3161dd3b0c74ef50c2d0a1f8c Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sat, 30 May 2026 22:06:46 +0300 Subject: [PATCH 23/49] refactor(supervisor-process): lift managed_children tracker from openshell-sandbox The MANAGED_CHILDREN set tracks PIDs of supervisor-spawned children (entrypoint + SSH sessions) so the orchestrator's SIGCHLD reaper can distinguish them from incidental zombies. Pure process-side concern, moves to openshell_supervisor_process::managed_children with three public fns: register, unregister, is_managed. 
Updates lib.rs reaper, process.rs, and ssh.rs to call through the new module path. Drops the now-unused HashSet import from lib.rs. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 40 +------------- crates/openshell-sandbox/src/process.rs | 12 ++--- crates/openshell-sandbox/src/ssh.rs | 12 ++--- .../openshell-supervisor-process/src/lib.rs | 1 + .../src/managed_children.rs | 53 +++++++++++++++++++ 5 files changed, 68 insertions(+), 50 deletions(-) create mode 100644 crates/openshell-supervisor-process/src/managed_children.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 2484d24a3..74f3e3bef 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -14,8 +14,6 @@ mod sandbox; mod ssh; use miette::{IntoDiagnostic, Result}; -#[cfg(target_os = "linux")] -use std::collections::HashSet; use std::future::Future; use std::net::SocketAddr; use std::sync::Arc; @@ -146,41 +144,7 @@ fn route_refresh_interval_secs() -> u64 { } #[cfg(target_os = "linux")] -static MANAGED_CHILDREN: LazyLock>> = - LazyLock::new(|| Mutex::new(HashSet::new())); - -#[cfg(target_os = "linux")] -pub(crate) fn register_managed_child(pid: u32) { - let Ok(pid) = i32::try_from(pid) else { - return; - }; - if pid <= 0 { - return; - } - if let Ok(mut children) = MANAGED_CHILDREN.lock() { - children.insert(pid); - } -} - -#[cfg(target_os = "linux")] -pub(crate) fn unregister_managed_child(pid: u32) { - let Ok(pid) = i32::try_from(pid) else { - return; - }; - if pid <= 0 { - return; - } - if let Ok(mut children) = MANAGED_CHILDREN.lock() { - children.remove(&pid); - } -} - -#[cfg(target_os = "linux")] -fn is_managed_child(pid: i32) -> bool { - MANAGED_CHILDREN - .lock() - .is_ok_and(|children| children.contains(&pid)) -} +use openshell_supervisor_process::managed_children; /// Handles and values produced by [`run_networking`] that the rest of /// `run_sandbox` consumes. 
@@ -533,7 +497,7 @@ async fn run_process( break; }; - if is_managed_child(pid.as_raw()) { + if managed_children::is_managed(pid.as_raw()) { // Let the explicit waiter own this child status. break; } diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 2e6f78fd2..bedbe574f 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -6,13 +6,13 @@ use crate::sandbox; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; -#[cfg(target_os = "linux")] -use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; use nix::sys::signal::{self, Signal}; use nix::unistd::{Group, Pid, User}; use openshell_core::policy::{NetworkMode, SandboxPolicy}; use openshell_supervisor_process::child_env; +#[cfg(target_os = "linux")] +use openshell_supervisor_process::managed_children; use std::collections::HashMap; use std::ffi::CString; #[cfg(target_os = "linux")] @@ -321,7 +321,7 @@ impl ProcessHandle { let child = cmd.spawn().into_diagnostic()?; let pid = child.id().unwrap_or(0); - register_managed_child(pid); + managed_children::register(pid); debug!(pid, program, "Process spawned"); @@ -418,7 +418,7 @@ impl ProcessHandle { let child = cmd.spawn().into_diagnostic()?; let pid = child.id().unwrap_or(0); #[cfg(target_os = "linux")] - register_managed_child(pid); + managed_children::register(pid); debug!(pid, program, "Process spawned"); @@ -439,7 +439,7 @@ impl ProcessHandle { pub async fn wait(&mut self) -> std::io::Result { let status = self.child.wait().await; #[cfg(target_os = "linux")] - unregister_managed_child(self.pid); + managed_children::unregister(self.pid); let status = status?; Ok(ProcessStatus::from(status)) } @@ -489,7 +489,7 @@ impl ProcessHandle { impl Drop for ProcessHandle { fn drop(&mut self) { #[cfg(target_os = "linux")] - unregister_managed_child(self.pid); + managed_children::unregister(self.pid); } } diff 
--git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-sandbox/src/ssh.rs index c350f4105..f783b423e 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-sandbox/src/ssh.rs @@ -5,8 +5,6 @@ use crate::process::drop_privileges; use crate::sandbox; -#[cfg(target_os = "linux")] -use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; use nix::unistd::setsid; @@ -16,6 +14,8 @@ use openshell_ocsf::{ ActionId, ActivityId, DispositionId, SeverityId, SshActivityBuilder, StatusId, ocsf_emit, }; use openshell_supervisor_process::child_env; +#[cfg(target_os = "linux")] +use openshell_supervisor_process::managed_children; use rand_core::OsRng; use russh::keys::{Algorithm, PrivateKey}; use russh::server::{Auth, Handle, Session}; @@ -797,7 +797,7 @@ fn spawn_pty_shell( #[cfg(target_os = "linux")] let child_pid = child.id(); #[cfg(target_os = "linux")] - register_managed_child(child_pid); + managed_children::register(child_pid); let master_file = master; let (sender, receiver) = mpsc::channel::>(); @@ -843,7 +843,7 @@ fn spawn_pty_shell( std::thread::spawn(move || { let status = child.wait().ok(); #[cfg(target_os = "linux")] - unregister_managed_child(child_pid); + managed_children::unregister(child_pid); let code = status.and_then(|s| s.code()).unwrap_or(1).unsigned_abs(); // Wait for the reader thread to finish forwarding all output before // sending exit-status and closing the channel. 
This prevents the @@ -943,7 +943,7 @@ fn spawn_pipe_exec( #[cfg(target_os = "linux")] let child_pid = child.id(); #[cfg(target_os = "linux")] - register_managed_child(child_pid); + managed_children::register(child_pid); let child_stdin = child.stdin.take(); let child_stdout = child.stdout.take().expect("stdout must be piped"); @@ -1015,7 +1015,7 @@ fn spawn_pipe_exec( std::thread::spawn(move || { let status = child.wait().ok(); #[cfg(target_os = "linux")] - unregister_managed_child(child_pid); + managed_children::unregister(child_pid); let code = status.and_then(|s| s.code()).unwrap_or(1).unsigned_abs(); // Wait for both reader threads. let _ = reader_done_rx.recv_timeout(Duration::from_secs(2)); diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index d7b61c565..06d178800 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -12,6 +12,7 @@ pub mod bypass_monitor; pub mod child_env; pub mod debug_rpc; pub mod log_push; +pub mod managed_children; pub mod proposals; pub mod skills; pub mod supervisor_session; diff --git a/crates/openshell-supervisor-process/src/managed_children.rs b/crates/openshell-supervisor-process/src/managed_children.rs new file mode 100644 index 000000000..311c80693 --- /dev/null +++ b/crates/openshell-supervisor-process/src/managed_children.rs @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide tracker for sandbox-managed child PIDs. +//! +//! The supervisor spawns several long-lived children (the entrypoint, SSH +//! sessions). Each registers its PID here on spawn and removes it on exit so +//! the orchestrator's `SIGCHLD` reaper can distinguish supervised processes +//! from incidental zombies. 
+ +#![cfg(target_os = "linux")] + +use std::collections::HashSet; +use std::sync::{LazyLock, Mutex}; + +static MANAGED_CHILDREN: LazyLock>> = + LazyLock::new(|| Mutex::new(HashSet::new())); + +/// Add `pid` to the supervised-child set. Non-positive or out-of-range values +/// are silently ignored. +pub fn register(pid: u32) { + let Ok(pid) = i32::try_from(pid) else { + return; + }; + if pid <= 0 { + return; + } + if let Ok(mut children) = MANAGED_CHILDREN.lock() { + children.insert(pid); + } +} + +/// Remove `pid` from the supervised-child set. Non-positive or out-of-range +/// values are silently ignored. +pub fn unregister(pid: u32) { + let Ok(pid) = i32::try_from(pid) else { + return; + }; + if pid <= 0 { + return; + } + if let Ok(mut children) = MANAGED_CHILDREN.lock() { + children.remove(&pid); + } +} + +/// Return `true` if `pid` is currently in the supervised-child set. +#[must_use] +pub fn is_managed(pid: i32) -> bool { + MANAGED_CHILDREN + .lock() + .is_ok_and(|children| children.contains(&pid)) +} From 0c62902f57ca1eddf999f95ac13359ae45ae52fc Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 20:54:57 +0300 Subject: [PATCH 24/49] refactor(supervisor-process): move sandbox hardening from openshell-sandbox Lift the process-only hardening pieces (landlock, seccomp, PreparedSandbox, prepare/enforce, log_sandbox_readiness, top-level apply, and apply_supervisor_startup_hardening) from crates/openshell-sandbox/src/sandbox/ to crates/openshell-supervisor-process/src/sandbox/. Leave netns.rs and nft_ruleset.rs in openshell-sandbox for now, since both eventual leaf crates (supervisor-networking and supervisor-process) read from NetworkNamespace and its final home is decided when run_networking and run_process are extracted. Replace crate::ocsf_ctx() shims in landlock.rs and the new linux/mod.rs with direct openshell_ocsf::ctx::ctx() calls. 
Update call sites in lib.rs, process.rs, and ssh.rs to import sandbox from openshell_supervisor_process while keeping the netns import unchanged. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 2 +- crates/openshell-sandbox/src/process.rs | 2 +- .../src/sandbox/linux/mod.rs | 163 +---------------- crates/openshell-sandbox/src/sandbox/mod.rs | 58 +----- crates/openshell-sandbox/src/ssh.rs | 14 +- .../openshell-supervisor-process/src/lib.rs | 1 + .../src/sandbox/linux/landlock.rs | 12 +- .../src/sandbox/linux/mod.rs | 166 ++++++++++++++++++ .../src/sandbox/linux/seccomp.rs | 0 .../src/sandbox/mod.rs | 57 ++++++ 10 files changed, 249 insertions(+), 226 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/sandbox/linux/landlock.rs (98%) create mode 100644 crates/openshell-supervisor-process/src/sandbox/linux/mod.rs rename crates/{openshell-sandbox => openshell-supervisor-process}/src/sandbox/linux/seccomp.rs (100%) create mode 100644 crates/openshell-supervisor-process/src/sandbox/mod.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 74f3e3bef..5deb471e7 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -82,9 +82,9 @@ use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPol use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_networking::identity::BinaryIdentityCache; use openshell_supervisor_networking::mechanistic_mapper; +pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; pub use process::{ProcessHandle, ProcessStatus}; -pub use sandbox::apply_supervisor_startup_hardening; /// Default interval (seconds) for re-fetching the inference route bundle from /// the gateway in cluster mode. 
Override at runtime with the diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index bedbe574f..c96724755 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -3,7 +3,6 @@ //! Process management and signal handling. -use crate::sandbox; #[cfg(target_os = "linux")] use crate::sandbox::linux::netns::NetworkNamespace; use miette::{IntoDiagnostic, Result}; @@ -13,6 +12,7 @@ use openshell_core::policy::{NetworkMode, SandboxPolicy}; use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] use openshell_supervisor_process::managed_children; +use openshell_supervisor_process::sandbox; use std::collections::HashMap; use std::ffi::CString; #[cfg(target_os = "linux")] diff --git a/crates/openshell-sandbox/src/sandbox/linux/mod.rs b/crates/openshell-sandbox/src/sandbox/linux/mod.rs index 487d8b4ad..48a30106b 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/mod.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/mod.rs @@ -1,168 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Linux sandbox implementation using Landlock and seccomp. +//! Linux-only network namespace helpers. -mod landlock; pub mod netns; mod nft_ruleset; -mod seccomp; - -use miette::Result; -use openshell_core::policy::SandboxPolicy; -use std::path::PathBuf; -use std::sync::Once; - -/// Opaque handle to a prepared-but-not-yet-enforced sandbox. -/// Holds the Landlock ruleset with `PathFds` opened as root. -pub struct PreparedSandbox { - landlock: Option, - policy: SandboxPolicy, -} - -/// Phase 1: Prepare sandbox restrictions **as root** (before `drop_privileges`). -/// -/// Opens Landlock `PathFds` while the process still has root privileges, -/// ensuring paths like mode-700 directories are accessible. 
-pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result { - let landlock = landlock::prepare(policy, workdir)?; - Ok(PreparedSandbox { - landlock, - policy: policy.clone(), - }) -} - -/// Phase 2: Enforce prepared sandbox restrictions (after `drop_privileges`). -/// -/// Calls `restrict_self()` for Landlock and applies seccomp filters. -/// Neither operation requires root privileges. -pub fn enforce(prepared: PreparedSandbox) -> Result<()> { - if let Some(ruleset) = prepared.landlock { - landlock::enforce(ruleset)?; - } - seccomp::apply(&prepared.policy)?; - Ok(()) -} - -/// Apply the supervisor seccomp prelude after privileged bootstrap completes. -pub fn apply_supervisor_prelude() -> Result<()> { - seccomp::apply_supervisor_prelude() -} - -/// Legacy single-phase apply. Kept for backward compatibility. -/// New callers should use [`prepare`] + [`enforce`] for correct privilege ordering. -#[allow(dead_code)] // Retained for backward compat; live callers use prepare+enforce. -pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { - landlock::apply(policy, workdir)?; - seccomp::apply(policy)?; - Ok(()) -} - -/// Probe Landlock availability and emit OCSF logs from the parent process. -/// -/// This must be called **before** `pre_exec` / `fork()` so that the OCSF events -/// are emitted through the parent's tracing subscriber (the child process after -/// fork does not have a working tracing pipeline). 
-pub fn log_sandbox_readiness(policy: &SandboxPolicy, workdir: Option<&str>) { - static PROBED: Once = Once::new(); - let mut already_probed = true; - PROBED.call_once(|| already_probed = false); - if already_probed { - return; - } - - let mut read_write = policy.filesystem.read_write.clone(); - let read_only = &policy.filesystem.read_only; - - if policy.filesystem.include_workdir - && let Some(dir) = workdir - { - let workdir_path = PathBuf::from(dir); - if !read_write.contains(&workdir_path) { - read_write.push(workdir_path); - } - } - - let total_paths = read_only.len() + read_write.len(); - - if total_paths == 0 { - openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) - .severity(openshell_ocsf::SeverityId::Informational) - .status(openshell_ocsf::StatusId::Success) - .state(openshell_ocsf::StateId::Other, "skipped") - .message("Landlock filesystem sandbox skipped: no paths configured".to_string()) - .build() - ); - return; - } - - let availability = landlock::probe_availability(); - if let landlock::LandlockAvailability::Available { abi } = &availability { - openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) - .severity(openshell_ocsf::SeverityId::Informational) - .status(openshell_ocsf::StatusId::Success) - .state(openshell_ocsf::StateId::Enabled, "probed") - .message(format!( - "Landlock filesystem sandbox available \ - [abi:v{abi} compat:{:?} ro:{} rw:{}]", - policy.landlock.compatibility, - read_only.len(), - read_write.len(), - )) - .build() - ); - } else { - // Landlock is NOT available — this is the critical log that was - // previously invisible because it only fired inside pre_exec. - let is_best_effort = matches!( - policy.landlock.compatibility, - openshell_core::policy::LandlockCompatibility::BestEffort - ); - let (desc, msg) = if is_best_effort { - ( - format!( - "Sandbox will run WITHOUT filesystem restrictions: {availability}. 
\ - Policy requests {total_paths} path rule(s) \ - (ro:{} rw:{}) but Landlock cannot enforce them. \ - Set landlock.compatibility to 'hard_requirement' to make this fatal.", - read_only.len(), - read_write.len(), - ), - format!( - "Landlock filesystem sandbox unavailable (best_effort, degraded): {availability}" - ), - ) - } else { - ( - format!( - "Landlock is unavailable: {availability}. \ - Policy requires {total_paths} path rule(s) \ - (ro:{} rw:{}) with hard_requirement — sandbox startup will fail.", - read_only.len(), - read_write.len(), - ), - format!( - "Landlock filesystem sandbox unavailable (hard_requirement, will fail): {availability}" - ), - ) - }; - openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) - .activity(openshell_ocsf::ActivityId::Open) - .severity(openshell_ocsf::SeverityId::High) - .confidence(openshell_ocsf::ConfidenceId::High) - .is_alert(true) - .finding_info( - openshell_ocsf::FindingInfo::new( - "landlock-unavailable", - "Landlock Filesystem Sandbox Unavailable", - ) - .with_desc(&desc), - ) - .message(msg) - .build() - ); - } -} diff --git a/crates/openshell-sandbox/src/sandbox/mod.rs b/crates/openshell-sandbox/src/sandbox/mod.rs index 067e60ea5..ed9958384 100644 --- a/crates/openshell-sandbox/src/sandbox/mod.rs +++ b/crates/openshell-sandbox/src/sandbox/mod.rs @@ -1,57 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Platform sandboxing implementation. - -use miette::Result; -use openshell_core::policy::SandboxPolicy; +//! Network namespace and bypass-rule helpers retained in the sandbox crate. +//! +//! Hardening (landlock + seccomp + `PreparedSandbox`) lives in +//! `openshell-supervisor-process::sandbox`. The netns piece stays here +//! because both eventual leaf crates (`openshell-supervisor-networking` and +//! 
`openshell-supervisor-process`) read from it; its final home is decided +//! when `run_networking` and `run_process` are extracted. #[cfg(target_os = "linux")] pub mod linux; - -/// Apply sandboxing rules for the current platform. -/// -/// # Errors -/// -/// Returns an error if the sandbox cannot be applied. -// On Linux the spawn path uses `prepare`+`enforce` directly; this single-phase -// apply is only invoked from the non-Linux spawn_impl. -#[cfg_attr(target_os = "linux", allow(dead_code))] -#[cfg_attr(not(target_os = "linux"), allow(clippy::unnecessary_wraps))] -pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { - #[cfg(target_os = "linux")] - { - linux::apply(policy, workdir) - } - - #[cfg(not(target_os = "linux"))] - { - let _ = (policy, workdir); - openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) - .activity(openshell_ocsf::ActivityId::Open) - .severity(openshell_ocsf::SeverityId::Medium) - .finding_info(openshell_ocsf::FindingInfo::new( - "platform-sandbox-unavailable", - "Platform Sandboxing Not Implemented", - ).with_desc("Sandbox policy provided but platform sandboxing is not yet implemented on this OS")) - .message("Platform sandboxing not yet implemented") - .build() - ); - Ok(()) - } -} - -/// Apply seccomp hardening for the long-lived supervisor process itself. -#[cfg_attr(not(target_os = "linux"), allow(clippy::unnecessary_wraps))] -pub fn apply_supervisor_startup_hardening() -> Result<()> { - #[cfg(target_os = "linux")] - { - linux::apply_supervisor_prelude() - } - - #[cfg(not(target_os = "linux"))] - { - Ok(()) - } -} diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-sandbox/src/ssh.rs index f783b423e..a52393bd1 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-sandbox/src/ssh.rs @@ -4,7 +4,6 @@ //! Embedded SSH server for sandbox access. 
use crate::process::drop_privileges; -use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; use nix::unistd::setsid; @@ -16,6 +15,7 @@ use openshell_ocsf::{ use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] use openshell_supervisor_process::managed_children; +use openshell_supervisor_process::sandbox; use rand_core::OsRng; use russh::keys::{Algorithm, PrivateKey}; use russh::server::{Auth, Handle, Session}; @@ -1065,7 +1065,8 @@ mod unsafe_pty { _workdir: Option, slave_fd: RawFd, netns_fd: Option, - #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, + #[cfg(target_os = "linux")] + prepared: openshell_supervisor_process::sandbox::linux::PreparedSandbox, ) { // Wrap in Option so we can .take() it out of the FnMut closure. // pre_exec is only called once (after fork, before exec). @@ -1095,7 +1096,8 @@ mod unsafe_pty { policy: SandboxPolicy, _workdir: Option, netns_fd: Option, - #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, + #[cfg(target_os = "linux")] + prepared: openshell_supervisor_process::sandbox::linux::PreparedSandbox, ) { #[cfg(target_os = "linux")] let mut prepared = Some(prepared); @@ -1114,7 +1116,9 @@ mod unsafe_pty { fn enter_netns_and_sandbox( netns_fd: Option, policy: &SandboxPolicy, - #[cfg(target_os = "linux")] prepared: Option, + #[cfg(target_os = "linux")] prepared: Option< + openshell_supervisor_process::sandbox::linux::PreparedSandbox, + >, ) -> std::io::Result<()> { // Enter network namespace before dropping privileges. // This ensures SSH shell processes are isolated to the same @@ -1142,7 +1146,7 @@ mod unsafe_pty { // restrict_self() does not require root. 
#[cfg(target_os = "linux")] if let Some(prepared) = prepared { - crate::sandbox::linux::enforce(prepared) + openshell_supervisor_process::sandbox::linux::enforce(prepared) .map_err(|err| std::io::Error::other(err.to_string()))?; } diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index 06d178800..fbbaeea94 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -14,5 +14,6 @@ pub mod debug_rpc; pub mod log_push; pub mod managed_children; pub mod proposals; +pub mod sandbox; pub mod skills; pub mod supervisor_session; diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-supervisor-process/src/sandbox/linux/landlock.rs similarity index 98% rename from crates/openshell-sandbox/src/sandbox/linux/landlock.rs rename to crates/openshell-supervisor-process/src/sandbox/linux/landlock.rs index 95f607c91..8808a1a87 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-supervisor-process/src/sandbox/linux/landlock.rs @@ -128,7 +128,7 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result { openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) @@ -161,7 +161,7 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result) -> Result { if matches!(compatibility, LandlockCompatibility::BestEffort) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) @@ 
-278,7 +278,7 @@ pub fn enforce(prepared: PreparedRuleset) -> Result<()> { if let Err(err) = result { if matches!(prepared.compatibility, LandlockCompatibility::BestEffort) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) @@ -354,7 +354,7 @@ fn try_open_path(path: &Path, compatibility: &LandlockCompatibility) -> Result, + policy: SandboxPolicy, +} + +/// Phase 1: Prepare sandbox restrictions **as root** (before `drop_privileges`). +/// +/// Opens Landlock `PathFds` while the process still has root privileges, +/// ensuring paths like mode-700 directories are accessible. +pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result { + let landlock = landlock::prepare(policy, workdir)?; + Ok(PreparedSandbox { + landlock, + policy: policy.clone(), + }) +} + +/// Phase 2: Enforce prepared sandbox restrictions (after `drop_privileges`). +/// +/// Calls `restrict_self()` for Landlock and applies seccomp filters. +/// Neither operation requires root privileges. +pub fn enforce(prepared: PreparedSandbox) -> Result<()> { + if let Some(ruleset) = prepared.landlock { + landlock::enforce(ruleset)?; + } + seccomp::apply(&prepared.policy)?; + Ok(()) +} + +/// Apply the supervisor seccomp prelude after privileged bootstrap completes. +pub fn apply_supervisor_prelude() -> Result<()> { + seccomp::apply_supervisor_prelude() +} + +/// Legacy single-phase apply. Kept for backward compatibility. +/// New callers should use [`prepare`] + [`enforce`] for correct privilege ordering. +#[allow(dead_code)] // Retained for backward compat; live callers use prepare+enforce. 
+pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { + landlock::apply(policy, workdir)?; + seccomp::apply(policy)?; + Ok(()) +} + +/// Probe Landlock availability and emit OCSF logs from the parent process. +/// +/// This must be called **before** `pre_exec` / `fork()` so that the OCSF events +/// are emitted through the parent's tracing subscriber (the child process after +/// fork does not have a working tracing pipeline). +pub fn log_sandbox_readiness(policy: &SandboxPolicy, workdir: Option<&str>) { + static PROBED: Once = Once::new(); + let mut already_probed = true; + PROBED.call_once(|| already_probed = false); + if already_probed { + return; + } + + let mut read_write = policy.filesystem.read_write.clone(); + let read_only = &policy.filesystem.read_only; + + if policy.filesystem.include_workdir + && let Some(dir) = workdir + { + let workdir_path = PathBuf::from(dir); + if !read_write.contains(&workdir_path) { + read_write.push(workdir_path); + } + } + + let total_paths = read_only.len() + read_write.len(); + + if total_paths == 0 { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Other, "skipped") + .message("Landlock filesystem sandbox skipped: no paths configured".to_string()) + .build() + ); + return; + } + + let availability = landlock::probe_availability(); + if let landlock::LandlockAvailability::Available { abi } = &availability { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Enabled, "probed") + .message(format!( + "Landlock filesystem sandbox available \ + [abi:v{abi} compat:{:?} ro:{} rw:{}]", + policy.landlock.compatibility, + read_only.len(), + 
read_write.len(), + )) + .build() + ); + } else { + // Landlock is NOT available — this is the critical log that was + // previously invisible because it only fired inside pre_exec. + let is_best_effort = matches!( + policy.landlock.compatibility, + openshell_core::policy::LandlockCompatibility::BestEffort + ); + let (desc, msg) = if is_best_effort { + ( + format!( + "Sandbox will run WITHOUT filesystem restrictions: {availability}. \ + Policy requests {total_paths} path rule(s) \ + (ro:{} rw:{}) but Landlock cannot enforce them. \ + Set landlock.compatibility to 'hard_requirement' to make this fatal.", + read_only.len(), + read_write.len(), + ), + format!( + "Landlock filesystem sandbox unavailable (best_effort, degraded): {availability}" + ), + ) + } else { + ( + format!( + "Landlock is unavailable: {availability}. \ + Policy requires {total_paths} path rule(s) \ + (ro:{} rw:{}) with hard_requirement — sandbox startup will fail.", + read_only.len(), + read_write.len(), + ), + format!( + "Landlock filesystem sandbox unavailable (hard_requirement, will fail): {availability}" + ), + ) + }; + openshell_ocsf::ocsf_emit!( + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) + .activity(openshell_ocsf::ActivityId::Open) + .severity(openshell_ocsf::SeverityId::High) + .confidence(openshell_ocsf::ConfidenceId::High) + .is_alert(true) + .finding_info( + openshell_ocsf::FindingInfo::new( + "landlock-unavailable", + "Landlock Filesystem Sandbox Unavailable", + ) + .with_desc(&desc), + ) + .message(msg) + .build() + ); + } +} diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-supervisor-process/src/sandbox/linux/seccomp.rs similarity index 100% rename from crates/openshell-sandbox/src/sandbox/linux/seccomp.rs rename to crates/openshell-supervisor-process/src/sandbox/linux/seccomp.rs diff --git a/crates/openshell-supervisor-process/src/sandbox/mod.rs b/crates/openshell-supervisor-process/src/sandbox/mod.rs new file mode 
100644 index 000000000..ff44f8ba1 --- /dev/null +++ b/crates/openshell-supervisor-process/src/sandbox/mod.rs @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Platform sandboxing implementation. + +use miette::Result; +use openshell_core::policy::SandboxPolicy; + +#[cfg(target_os = "linux")] +pub mod linux; + +/// Apply sandboxing rules for the current platform. +/// +/// # Errors +/// +/// Returns an error if the sandbox cannot be applied. +// On Linux the spawn path uses `prepare`+`enforce` directly; this single-phase +// apply is only invoked from the non-Linux spawn_impl. +#[cfg_attr(target_os = "linux", allow(dead_code))] +#[cfg_attr(not(target_os = "linux"), allow(clippy::unnecessary_wraps))] +pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { + #[cfg(target_os = "linux")] + { + linux::apply(policy, workdir) + } + + #[cfg(not(target_os = "linux"))] + { + let _ = (policy, workdir); + openshell_ocsf::ocsf_emit!( + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) + .activity(openshell_ocsf::ActivityId::Open) + .severity(openshell_ocsf::SeverityId::Medium) + .finding_info(openshell_ocsf::FindingInfo::new( + "platform-sandbox-unavailable", + "Platform Sandboxing Not Implemented", + ).with_desc("Sandbox policy provided but platform sandboxing is not yet implemented on this OS")) + .message("Platform sandboxing not yet implemented") + .build() + ); + Ok(()) + } +} + +/// Apply seccomp hardening for the long-lived supervisor process itself. 
+#[cfg_attr(not(target_os = "linux"), allow(clippy::unnecessary_wraps))] +pub fn apply_supervisor_startup_hardening() -> Result<()> { + #[cfg(target_os = "linux")] + { + linux::apply_supervisor_prelude() + } + + #[cfg(not(target_os = "linux"))] + { + Ok(()) + } +} From e4f042fb816bfed050a4edd07cb4c995279aa30f Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 21:09:28 +0300 Subject: [PATCH 25/49] refactor(core): lift proposals flag from openshell-supervisor-process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move proposals.rs (AGENT_PROPOSALS_ENABLED OnceLock + agent_proposals_enabled reader + test_helpers::ProposalsFlagGuard) from openshell-supervisor-process to openshell-core so both eventual leaf crates can read it without depending on each other. The flag is process-wide singleton state initialised once during sandbox startup and read by both the policy.local route (networking-side) and the skills installer (process-side) — same shape as openshell_ocsf::ctx. Move the test-helpers Cargo feature alongside it: openshell-core gains the feature, openshell-supervisor-process loses it, and openshell-sandbox's dev-dependency now enables openshell-core/test-helpers. Update the sandbox re-export shim to point at openshell_core::proposals. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-core/Cargo.toml | 4 ++++ crates/openshell-core/src/lib.rs | 1 + .../src/proposals.rs | 0 crates/openshell-sandbox/Cargo.toml | 2 +- crates/openshell-sandbox/src/lib.rs | 6 ++---- crates/openshell-supervisor-process/Cargo.toml | 6 ------ crates/openshell-supervisor-process/src/lib.rs | 1 - 7 files changed, 8 insertions(+), 12 deletions(-) rename crates/{openshell-supervisor-process => openshell-core}/src/proposals.rs (100%) diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index 4f4acf0b3..92cf8ee82 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -31,6 +31,10 @@ base64 = { workspace = true } ## Off by default so production builds have an empty registry. ## Enabled by e2e tests and during development. dev-settings = [] +## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test +## code in other crates. Enabled by openshell-sandbox and +## openshell-supervisor-networking dev builds. 
+test-helpers = [] [build-dependencies] tonic-build = { workspace = true } diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 19b15e82c..5f1b61792 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -25,6 +25,7 @@ pub mod paths; pub mod policy; pub mod procfs; pub mod progress; +pub mod proposals; pub mod proto; pub mod provider_credentials; pub mod sandbox_env; diff --git a/crates/openshell-supervisor-process/src/proposals.rs b/crates/openshell-core/src/proposals.rs similarity index 100% rename from crates/openshell-supervisor-process/src/proposals.rs rename to crates/openshell-core/src/proposals.rs diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index be575d971..4d33ec47c 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -92,7 +92,7 @@ tempfile = "3" uuid = { version = "1", features = ["v4"] } [dev-dependencies] -openshell-supervisor-process = { path = "../openshell-supervisor-process", features = ["test-helpers"] } +openshell-core = { path = "../openshell-core", features = ["test-helpers"] } tempfile = "3" temp-env = "0.3" tokio-tungstenite = { workspace = true } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 5deb471e7..01da928b4 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -63,12 +63,10 @@ pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; /// to gate the agent-controlled mutation surface. Exposed `pub(crate)` so /// unit tests in sibling modules can flip the flag through a serialized /// guard (see `policy_local::tests::ProposalsFlagGuard`). 
-pub(crate) use openshell_supervisor_process::proposals::{ - AGENT_PROPOSALS_ENABLED, agent_proposals_enabled, -}; +pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_proposals_enabled}; #[cfg(test)] -pub(crate) use openshell_supervisor_process::proposals::test_helpers; +pub(crate) use openshell_core::proposals::test_helpers; use crate::l7::tls::{ CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index ede8108b9..9e8d7c8d2 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -26,12 +26,6 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true } uuid = { workspace = true } -[features] -## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test -## code in other crates. Enabled by openshell-supervisor-networking and -## openshell-sandbox dev-dependencies. -test-helpers = [] - [dev-dependencies] tempfile = "3" diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index fbbaeea94..caec45451 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -13,7 +13,6 @@ pub mod child_env; pub mod debug_rpc; pub mod log_push; pub mod managed_children; -pub mod proposals; pub mod sandbox; pub mod skills; pub mod supervisor_session; From 0c4127b9d7981107a3286bd671ebf7441243ce20 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 21:21:32 +0300 Subject: [PATCH 26/49] refactor(core): lift netns + nft_ruleset from openshell-sandbox Move NetworkNamespace and the nft_ruleset bypass-rule generator from crates/openshell-sandbox/src/sandbox/linux/ to crates/openshell-core/src/netns/. 
Both eventual leaf crates (supervisor-networking and supervisor-process) read from NetworkNamespace, so it must live somewhere both can depend on without violating the Shape A no-leaf-to-leaf rule. Replace crate::ocsf_ctx() shims in netns with direct openshell_ocsf::ctx::ctx() calls, matching the pattern used in already-migrated process modules. Update super::nft_ruleset references inside netns to nft_ruleset since the module is now a sibling sub-module of netns/mod.rs. Add openshell-ocsf and uuid as linux-only dependencies of openshell-core, and gate pub mod netns on target_os = "linux" since the implementation uses netlink, ip(8), and namespace fds. Delete the now-empty sandbox/{mod.rs, linux/mod.rs} stubs and update NetworkNamespace import paths in lib.rs and process.rs to point at openshell_core::netns. Signed-off-by: Radoslav Hubenov --- Cargo.lock | 2 ++ crates/openshell-core/Cargo.toml | 4 +++ crates/openshell-core/src/lib.rs | 2 ++ .../src/netns/mod.rs} | 25 +++++++++---------- .../src/netns}/nft_ruleset.rs | 0 crates/openshell-sandbox/src/lib.rs | 3 +-- crates/openshell-sandbox/src/process.rs | 4 +-- .../src/sandbox/linux/mod.rs | 7 ------ crates/openshell-sandbox/src/sandbox/mod.rs | 13 ---------- 9 files changed, 23 insertions(+), 37 deletions(-) rename crates/{openshell-sandbox/src/sandbox/linux/netns.rs => openshell-core/src/netns/mod.rs} (96%) rename crates/{openshell-sandbox/src/sandbox/linux => openshell-core/src/netns}/nft_ruleset.rs (100%) delete mode 100644 crates/openshell-sandbox/src/sandbox/linux/mod.rs delete mode 100644 crates/openshell-sandbox/src/sandbox/mod.rs diff --git a/Cargo.lock b/Cargo.lock index eb9ec5401..66fbf9198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3461,6 +3461,7 @@ dependencies = [ "hex", "ipnet", "miette", + "openshell-ocsf", "prost", "prost-types", "protobuf-src", @@ -3474,6 +3475,7 @@ dependencies = [ "tonic-build", "tracing", "url", + "uuid", ] [[package]] diff --git a/crates/openshell-core/Cargo.toml 
b/crates/openshell-core/Cargo.toml index 92cf8ee82..d0b13ff67 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -26,6 +26,10 @@ hex = "0.4" sha2 = { workspace = true } base64 = { workspace = true } +[target.'cfg(target_os = "linux")'.dependencies] +openshell-ocsf = { path = "../openshell-ocsf" } +uuid = { workspace = true } + [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. ## Off by default so production builds have an empty registry. diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 5f1b61792..b189b60c9 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -21,6 +21,8 @@ pub mod image; pub mod inference; pub mod metadata; pub mod net; +#[cfg(target_os = "linux")] +pub mod netns; pub mod paths; pub mod policy; pub mod procfs; diff --git a/crates/openshell-sandbox/src/sandbox/linux/netns.rs b/crates/openshell-core/src/netns/mod.rs similarity index 96% rename from crates/openshell-sandbox/src/sandbox/linux/netns.rs rename to crates/openshell-core/src/netns/mod.rs index 433f70b1c..bad44ac89 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/netns.rs +++ b/crates/openshell-core/src/netns/mod.rs @@ -7,6 +7,8 @@ //! the sandbox to the host. This ensures the sandboxed process can only //! communicate through the proxy running on the host side of the veth. 
+mod nft_ruleset; + use miette::{IntoDiagnostic, Result}; use std::net::IpAddr; use std::os::unix::io::RawFd; @@ -71,7 +73,7 @@ impl NetworkNamespace { .unwrap(); openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "creating") @@ -165,7 +167,7 @@ impl NetworkNamespace { }; openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "created") @@ -262,7 +264,7 @@ impl NetworkNamespace { pub fn install_bypass_rules(&self, proxy_port: u16) -> Result<()> { let Some(nft_path) = find_nft() else { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Disabled, "degraded") @@ -287,15 +289,12 @@ impl NetworkNamespace { // before reject rules in the chain so packets are logged before being // rejected. If the kernel lacks nft_log support, fall back to the // reject-only ruleset. 
- let ruleset_with_log = super::nft_ruleset::generate_bypass_ruleset( - &host_ip_str, - proxy_port, - Some(&log_prefix), - ); + let ruleset_with_log = + nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, Some(&log_prefix)); if let Err(e) = run_nft_netns(&self.name, &nft_path, &ruleset_with_log) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Low) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Other, "degraded") @@ -307,11 +306,11 @@ impl NetworkNamespace { ); let ruleset_no_log = - super::nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, None); + nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, None); if let Err(e) = run_nft_netns(&self.name, &nft_path, &ruleset_no_log) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Disabled, "failed") @@ -326,7 +325,7 @@ impl NetworkNamespace { } openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "installed") @@ -369,7 +368,7 @@ impl Drop for NetworkNamespace { } openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Disabled, "cleaned_up") diff --git 
a/crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs b/crates/openshell-core/src/netns/nft_ruleset.rs similarity index 100% rename from crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs rename to crates/openshell-core/src/netns/nft_ruleset.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 01da928b4..bc6f24c71 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -10,7 +10,6 @@ pub mod opa; mod policy_local; mod process; pub mod proxy; -mod sandbox; mod ssh; use miette::{IntoDiagnostic, Result}; @@ -75,7 +74,7 @@ use crate::l7::tls::{ use crate::opa::OpaEngine; use crate::proxy::ProxyHandle; #[cfg(target_os = "linux")] -use crate::sandbox::linux::netns::NetworkNamespace; +use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_networking::identity::BinaryIdentityCache; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index c96724755..fcea5cf3d 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -3,11 +3,11 @@ //! Process management and signal handling. 
-#[cfg(target_os = "linux")] -use crate::sandbox::linux::netns::NetworkNamespace; use miette::{IntoDiagnostic, Result}; use nix::sys::signal::{self, Signal}; use nix::unistd::{Group, Pid, User}; +#[cfg(target_os = "linux")] +use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, SandboxPolicy}; use openshell_supervisor_process::child_env; #[cfg(target_os = "linux")] diff --git a/crates/openshell-sandbox/src/sandbox/linux/mod.rs b/crates/openshell-sandbox/src/sandbox/linux/mod.rs deleted file mode 100644 index 48a30106b..000000000 --- a/crates/openshell-sandbox/src/sandbox/linux/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Linux-only network namespace helpers. - -pub mod netns; -mod nft_ruleset; diff --git a/crates/openshell-sandbox/src/sandbox/mod.rs b/crates/openshell-sandbox/src/sandbox/mod.rs deleted file mode 100644 index ed9958384..000000000 --- a/crates/openshell-sandbox/src/sandbox/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Network namespace and bypass-rule helpers retained in the sandbox crate. -//! -//! Hardening (landlock + seccomp + `PreparedSandbox`) lives in -//! `openshell-supervisor-process::sandbox`. The netns piece stays here -//! because both eventual leaf crates (`openshell-supervisor-networking` and -//! `openshell-supervisor-process`) read from it; its final home is decided -//! when `run_networking` and `run_process` are extracted. 
- -#[cfg(target_os = "linux")] -pub mod linux; From d0c5b7210fcd21a982aea0bb055442eec5e3ad8c Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 21:35:18 +0300 Subject: [PATCH 27/49] refactor(supervisor-process): move process.rs and ssh.rs from openshell-sandbox Lift the entrypoint process spawn module and the embedded SSH server module into openshell-supervisor-process. openshell-sandbox now re-exports ProcessHandle/ProcessStatus and calls openshell_supervisor_process::ssh::run_ssh_server directly. Signed-off-by: Radoslav Hubenov --- Cargo.lock | 6 ++++ crates/openshell-sandbox/src/lib.rs | 6 ++-- .../openshell-supervisor-process/Cargo.toml | 8 +++++ .../openshell-supervisor-process/src/lib.rs | 2 ++ .../src/process.rs | 10 +++--- .../src/ssh.rs | 34 ++++++++----------- 6 files changed, 38 insertions(+), 28 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/process.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-process}/src/ssh.rs (98%) diff --git a/Cargo.lock b/Cargo.lock index 66fbf9198..f43347e44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3787,11 +3787,17 @@ dependencies = [ name = "openshell-supervisor-process" version = "0.0.0" dependencies = [ + "anyhow", "base64 0.22.1", "hex", + "libc", "miette", + "nix", "openshell-core", "openshell-ocsf", + "rand_core 0.6.4", + "russh", + "rustix 1.1.4", "serde_json", "sha2 0.10.9", "tempfile", diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index bc6f24c71..159af1d0d 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -8,9 +8,7 @@ pub mod l7; pub mod opa; mod policy_local; -mod process; pub mod proxy; -mod ssh; use miette::{IntoDiagnostic, Result}; use std::future::Future; @@ -79,9 +77,9 @@ use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPol use openshell_core::provider_credentials::ProviderCredentialState; use 
openshell_supervisor_networking::identity::BinaryIdentityCache; use openshell_supervisor_networking::mechanistic_mapper; +pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; -pub use process::{ProcessHandle, ProcessStatus}; /// Default interval (seconds) for re-fetching the inference route bundle from /// the gateway in cluster mode. Override at runtime with the @@ -526,7 +524,7 @@ async fn run_process( let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); tokio::spawn(async move { - if let Err(err) = ssh::run_ssh_server( + if let Err(err) = openshell_supervisor_process::ssh::run_ssh_server( listen_path, ssh_ready_tx, policy_clone, diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index 9e8d7c8d2..ec1520e42 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -14,9 +14,13 @@ rust-version.workspace = true openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } +anyhow = { workspace = true } base64 = { workspace = true } hex = "0.4" miette = { workspace = true } +nix = { workspace = true } +rand_core = "0.6" +russh = "0.57" serde_json = { workspace = true } sha2 = { workspace = true } tokio = { workspace = true } @@ -26,6 +30,10 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true } uuid = { workspace = true } +[target.'cfg(unix)'.dependencies] +libc = "0.2" +rustix = { workspace = true } + [dev-dependencies] tempfile = "3" diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index caec45451..cc5ea88be 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -13,6 +13,8 @@ pub mod child_env; pub mod debug_rpc; 
pub mod log_push; pub mod managed_children; +pub mod process; pub mod sandbox; pub mod skills; +pub mod ssh; pub mod supervisor_session; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-supervisor-process/src/process.rs similarity index 99% rename from crates/openshell-sandbox/src/process.rs rename to crates/openshell-supervisor-process/src/process.rs index fcea5cf3d..2437bcb3b 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -3,16 +3,16 @@ //! Process management and signal handling. +use crate::child_env; +#[cfg(target_os = "linux")] +use crate::managed_children; +use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::sys::signal::{self, Signal}; use nix::unistd::{Group, Pid, User}; #[cfg(target_os = "linux")] use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, SandboxPolicy}; -use openshell_supervisor_process::child_env; -#[cfg(target_os = "linux")] -use openshell_supervisor_process::managed_children; -use openshell_supervisor_process::sandbox; use std::collections::HashMap; use std::ffi::CString; #[cfg(target_os = "linux")] @@ -463,7 +463,7 @@ impl ProcessHandle { // First try SIGTERM if let Err(e) = self.signal(Signal::SIGTERM) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ProcessActivityBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ProcessActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Close) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-supervisor-process/src/ssh.rs similarity index 98% rename from crates/openshell-sandbox/src/ssh.rs rename to crates/openshell-supervisor-process/src/ssh.rs index a52393bd1..366e45def 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-supervisor-process/src/ssh.rs @@ -3,7 +3,11 @@ //! 
Embedded SSH server for sandbox access. +use crate::child_env; +#[cfg(target_os = "linux")] +use crate::managed_children; use crate::process::drop_privileges; +use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; use nix::unistd::setsid; @@ -12,10 +16,6 @@ use openshell_core::provider_credentials::ProviderCredentialState; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, SeverityId, SshActivityBuilder, StatusId, ocsf_emit, }; -use openshell_supervisor_process::child_env; -#[cfg(target_os = "linux")] -use openshell_supervisor_process::managed_children; -use openshell_supervisor_process::sandbox; use rand_core::OsRng; use russh::keys::{Algorithm, PrivateKey}; use russh::server::{Auth, Handle, Session}; @@ -86,7 +86,7 @@ fn ssh_server_init( } ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Listen) .severity(SeverityId::Informational) .status(StatusId::Success) @@ -146,7 +146,7 @@ pub async fn run_ssh_server( .await { ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -173,7 +173,7 @@ async fn handle_connection( // not by an application-level preface. The supervisor bridges the // gateway's RelayStream directly into this socket. ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -292,7 +292,7 @@ impl russh::server::Handler for SshHandler { // uses u32 for ports, but valid TCP ports are 0-65535. Without this // check, port 65537 truncates to port 1 (privileged). 
if port_to_connect > u32::from(u16::MAX) { - ocsf_emit!(SshActivityBuilder::new(crate::ocsf_ctx()) + ocsf_emit!(SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -307,7 +307,7 @@ impl russh::server::Handler for SshHandler { // Only allow forwarding to loopback destinations to prevent the // sandbox SSH server from being used as a generic proxy. if !is_loopback_host(host_to_connect) { - ocsf_emit!(SshActivityBuilder::new(crate::ocsf_ctx()) + ocsf_emit!(SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -331,7 +331,7 @@ impl russh::server::Handler for SshHandler { Ok(stream) => stream, Err(err) => { ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -465,7 +465,7 @@ impl russh::server::Handler for SshHandler { state.input_sender = Some(input_sender); } else { ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Rejected) @@ -1065,8 +1065,7 @@ mod unsafe_pty { _workdir: Option, slave_fd: RawFd, netns_fd: Option, - #[cfg(target_os = "linux")] - prepared: openshell_supervisor_process::sandbox::linux::PreparedSandbox, + #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, ) { // Wrap in Option so we can .take() it out of the FnMut closure. // pre_exec is only called once (after fork, before exec). 
@@ -1096,8 +1095,7 @@ mod unsafe_pty { policy: SandboxPolicy, _workdir: Option, netns_fd: Option, - #[cfg(target_os = "linux")] - prepared: openshell_supervisor_process::sandbox::linux::PreparedSandbox, + #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, ) { #[cfg(target_os = "linux")] let mut prepared = Some(prepared); @@ -1116,9 +1114,7 @@ mod unsafe_pty { fn enter_netns_and_sandbox( netns_fd: Option, policy: &SandboxPolicy, - #[cfg(target_os = "linux")] prepared: Option< - openshell_supervisor_process::sandbox::linux::PreparedSandbox, - >, + #[cfg(target_os = "linux")] prepared: Option, ) -> std::io::Result<()> { // Enter network namespace before dropping privileges. // This ensures SSH shell processes are isolated to the same @@ -1146,7 +1142,7 @@ mod unsafe_pty { // restrict_self() does not require root. #[cfg(target_os = "linux")] if let Some(prepared) = prepared { - openshell_supervisor_process::sandbox::linux::enforce(prepared) + crate::sandbox::linux::enforce(prepared) .map_err(|err| std::io::Error::other(err.to_string()))?; } From 436f138effe4a8918afb49fcccd3de7aa15530dc Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 21:53:08 +0300 Subject: [PATCH 28/49] refactor(supervisor-networking): move proxy, l7, opa, policy_local from openshell-sandbox Lift the egress proxy, L7 enforcement modules, OPA engine, and policy.local advisor API into openshell-supervisor-networking. Move accompanying data files (sandbox-policy.rego), test fixtures (testdata/), and integration tests (system_inference, websocket_upgrade). Sandbox lib.rs now references these via openshell_supervisor_networking::* and ProxyHandle::start_with_bind_addr is exposed as pub for the orchestrator call site. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 27 ++++++ crates/openshell-sandbox/src/lib.rs | 59 ++++++------- .../Cargo.toml | 28 ++++++ .../data/sandbox-policy.rego | 0 .../src/l7/graphql.rs | 0 .../src/l7/inference.rs | 0 .../src/l7/mod.rs | 4 +- .../src/l7/path.rs | 0 .../src/l7/provider.rs | 0 .../src/l7/relay.rs | 16 ++-- .../src/l7/rest.rs | 9 +- .../src/l7/tls.rs | 0 .../src/l7/websocket.rs | 6 +- .../src/lib.rs | 4 + .../src/opa.rs | 4 +- .../src/policy_local.rs | 23 +++-- .../src/proxy.rs | 88 +++++++++---------- .../testdata/sandbox-policy.yaml | 0 .../tests/system_inference.rs | 8 +- .../tests/websocket_upgrade.rs | 4 +- 20 files changed, 172 insertions(+), 108 deletions(-) rename crates/{openshell-sandbox => openshell-supervisor-networking}/data/sandbox-policy.rego (100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/graphql.rs (100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/inference.rs (100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/mod.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/path.rs (100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/provider.rs (100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/relay.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/rest.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/tls.rs (100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/l7/websocket.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/opa.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/policy_local.rs (99%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/src/proxy.rs (98%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/testdata/sandbox-policy.yaml 
(100%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/tests/system_inference.rs (94%) rename crates/{openshell-sandbox => openshell-supervisor-networking}/tests/websocket_upgrade.rs (98%) diff --git a/Cargo.lock b/Cargo.lock index f43347e44..0d416c0f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3776,11 +3776,38 @@ dependencies = [ name = "openshell-supervisor-networking" version = "0.0.0" dependencies = [ + "apollo-parser", + "base64 0.22.1", + "bytes", + "flate2", + "futures", + "glob", + "hex", + "ipnet", "miette", "openshell-core", + "openshell-ocsf", + "openshell-policy", + "openshell-router", + "rcgen", + "regorus", + "rustls", + "rustls-pemfile", + "serde", + "serde_json", + "serde_yml", + "sha1 0.10.6", + "sha2 0.10.9", + "temp-env", "tempfile", + "thiserror 2.0.18", "tokio", + "tokio-rustls", + "tokio-tungstenite 0.26.2", "tracing", + "url", + "uuid", + "webpki-roots 1.0.7", ] [[package]] diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 159af1d0d..283cf1b89 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -5,11 +5,6 @@ //! //! This crate provides process sandboxing and monitoring capabilities. -pub mod l7; -pub mod opa; -mod policy_local; -pub mod proxy; - use miette::{IntoDiagnostic, Result}; use std::future::Future; use std::net::SocketAddr; @@ -62,21 +57,18 @@ pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; /// guard (see `policy_local::tests::ProposalsFlagGuard`). 
pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_proposals_enabled}; -#[cfg(test)] -pub(crate) use openshell_core::proposals::test_helpers; - -use crate::l7::tls::{ - CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, - write_ca_files, -}; -use crate::opa::OpaEngine; -use crate::proxy::ProxyHandle; #[cfg(target_os = "linux")] use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_networking::identity::BinaryIdentityCache; +use openshell_supervisor_networking::l7::tls::{ + CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, + write_ca_files, +}; use openshell_supervisor_networking::mechanistic_mapper; +use openshell_supervisor_networking::opa::OpaEngine; +use openshell_supervisor_networking::proxy::ProxyHandle; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; @@ -218,7 +210,7 @@ async fn run_networking( opa_engine: Option<&Arc>, entrypoint_pid: Arc, provider_credentials: &ProviderCredentialState, - policy_local_ctx: &Arc, + policy_local_ctx: &Arc, sandbox_id: Option<&str>, openshell_endpoint: Option<&str>, inference_routes: Option<&str>, @@ -437,7 +429,7 @@ async fn run_process( entrypoint_pid: Arc, provider_credentials: ProviderCredentialState, provider_env: std::collections::HashMap, - policy_local_ctx: Arc, + policy_local_ctx: Arc, ocsf_enabled: Arc, ssh_proxy_url: Option, ssh_netns_fd: Option, @@ -861,11 +853,13 @@ pub async fn run_sandbox( policy_data, ) .await?; - let policy_local_ctx = Arc::new(policy_local::PolicyLocalContext::new( - retained_proto.clone(), - openshell_endpoint.clone(), - sandbox_name_for_agg.clone().or_else(|| 
sandbox_id.clone()), - )); + let policy_local_ctx = Arc::new( + openshell_supervisor_networking::policy_local::PolicyLocalContext::new( + retained_proto.clone(), + openshell_endpoint.clone(), + sandbox_name_for_agg.clone().or_else(|| sandbox_id.clone()), + ), + ); // Validate that the required "sandbox" user exists in this image. // All sandbox images must include this user for privilege dropping. @@ -1064,7 +1058,7 @@ async fn build_inference_context( sandbox_id: Option<&str>, openshell_endpoint: Option<&str>, inference_routes: Option<&str>, -) -> Result>> { +) -> Result>> { use openshell_router::Router; use openshell_router::config::RouterConfig; @@ -1210,14 +1204,16 @@ async fn build_inference_context( let router = Router::new().map_err(|e| miette::miette!("failed to initialize inference router: {e}"))?; - let patterns = l7::inference::default_patterns(); - - let ctx = Arc::new(proxy::InferenceContext::new( - patterns, - router, - user_routes, - system_routes, - )); + let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); + + let ctx = Arc::new( + openshell_supervisor_networking::proxy::InferenceContext::new( + patterns, + router, + user_routes, + system_routes, + ), + ); // Spawn background route cache refresh for cluster mode at startup so // request handling never depends on control-plane latency. 
@@ -2390,7 +2386,8 @@ struct PolicyPollLoopContext { interval_secs: u64, ocsf_enabled: Arc, provider_credentials: ProviderCredentialState, - policy_local_ctx: Option>, + policy_local_ctx: + Option>, } async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { diff --git a/crates/openshell-supervisor-networking/Cargo.toml b/crates/openshell-supervisor-networking/Cargo.toml index 5b465d6f1..26d4c7ec0 100644 --- a/crates/openshell-supervisor-networking/Cargo.toml +++ b/crates/openshell-supervisor-networking/Cargo.toml @@ -12,13 +12,41 @@ rust-version.workspace = true [dependencies] openshell-core = { path = "../openshell-core" } +openshell-ocsf = { path = "../openshell-ocsf" } +openshell-policy = { path = "../openshell-policy" } +openshell-router = { path = "../openshell-router" } +apollo-parser = { workspace = true } +base64 = { workspace = true } +bytes = { workspace = true } +flate2 = "1" +glob = { workspace = true } +hex = "0.4" +ipnet = "2" miette = { workspace = true } +rcgen = { workspace = true } +regorus = { version = "0.9", default-features = false, features = ["std", "arc", "glob", "yaml"] } +rustls = { workspace = true } +rustls-pemfile = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +serde_yml = { workspace = true } +sha1 = "0.10" +sha2 = { workspace = true } +thiserror = { workspace = true } tokio = { workspace = true } +tokio-rustls = { workspace = true } tracing = { workspace = true } +url = { workspace = true } +uuid = { workspace = true } +webpki-roots = { workspace = true } [dev-dependencies] +openshell-core = { path = "../openshell-core", features = ["test-helpers"] } tempfile = "3" +temp-env = "0.3" +tokio-tungstenite = { workspace = true } +futures = { workspace = true } [lints] workspace = true diff --git a/crates/openshell-sandbox/data/sandbox-policy.rego b/crates/openshell-supervisor-networking/data/sandbox-policy.rego similarity index 100% rename from 
crates/openshell-sandbox/data/sandbox-policy.rego rename to crates/openshell-supervisor-networking/data/sandbox-policy.rego diff --git a/crates/openshell-sandbox/src/l7/graphql.rs b/crates/openshell-supervisor-networking/src/l7/graphql.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/graphql.rs rename to crates/openshell-supervisor-networking/src/l7/graphql.rs diff --git a/crates/openshell-sandbox/src/l7/inference.rs b/crates/openshell-supervisor-networking/src/l7/inference.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/inference.rs rename to crates/openshell-supervisor-networking/src/l7/inference.rs diff --git a/crates/openshell-sandbox/src/l7/mod.rs b/crates/openshell-supervisor-networking/src/l7/mod.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/mod.rs rename to crates/openshell-supervisor-networking/src/l7/mod.rs index 703aafae4..31c2071e0 100644 --- a/crates/openshell-sandbox/src/l7/mod.rs +++ b/crates/openshell-supervisor-networking/src/l7/mod.rs @@ -122,7 +122,7 @@ pub fn parse_l7_config(val: &regorus::Value) -> Option { let tls = match get_object_str(val, "tls").as_deref() { Some("skip") => TlsMode::Skip, Some("terminate") => { - let event = openshell_ocsf::NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = openshell_ocsf::NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Other) .severity(openshell_ocsf::SeverityId::Medium) .message( @@ -134,7 +134,7 @@ pub fn parse_l7_config(val: &regorus::Value) -> Option { TlsMode::Auto } Some("passthrough") => { - let event = openshell_ocsf::NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = openshell_ocsf::NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Other) .severity(openshell_ocsf::SeverityId::Medium) .message( diff --git a/crates/openshell-sandbox/src/l7/path.rs b/crates/openshell-supervisor-networking/src/l7/path.rs similarity index 100% rename from
crates/openshell-sandbox/src/l7/path.rs rename to crates/openshell-supervisor-networking/src/l7/path.rs diff --git a/crates/openshell-sandbox/src/l7/provider.rs b/crates/openshell-supervisor-networking/src/l7/provider.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/provider.rs rename to crates/openshell-supervisor-networking/src/l7/provider.rs diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-supervisor-networking/src/l7/relay.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/relay.rs rename to crates/openshell-supervisor-networking/src/l7/relay.rs index 21e3133c2..a20769493 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-supervisor-networking/src/l7/relay.rs @@ -104,7 +104,7 @@ fn emit_parse_rejection(ctx: &L7EvalContext, detail: &str, engine_type: &str) { } else { &ctx.policy_name }; - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -150,7 +150,7 @@ where } // SQL provider is Phase 3 — fall through to passthrough with warning { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .dst_endpoint(Endpoint::from_domain(&ctx.host, ctx.port)) @@ -431,7 +431,7 @@ fn emit_l7_request_log( let summary = graphql_info .map(|info| format!(" {}", graphql_log_summary(info))) .unwrap_or_default(); - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -477,7 +477,7 @@ where "raw bidirectional relay (L7 enforcement no longer active)" }; ocsf_emit!( - NetworkActivityBuilder::new(crate::ocsf_ctx()) + 
NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .activity_name("Upgrade") .severity(SeverityId::Informational) @@ -735,7 +735,7 @@ where SeverityId::Informational, ), }; - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -828,7 +828,7 @@ fn close_if_stale(guard: &PolicyGenerationGuard, ctx: &L7EvalContext) -> bool { } ocsf_emit!( - NetworkActivityBuilder::new(crate::ocsf_ctx()) + NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -968,7 +968,7 @@ where ), }; let gql_summary = graphql_log_summary(&graphql_info); - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -1229,7 +1229,7 @@ where // Uses redacted_target (path only, no query params) to avoid logging secrets. let has_creds = resolver.is_some(); { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-supervisor-networking/src/l7/rest.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/rest.rs rename to crates/openshell-supervisor-networking/src/l7/rest.rs index 2216a378e..4f46d24ba 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-supervisor-networking/src/l7/rest.rs @@ -2285,7 +2285,8 @@ mod tests { #[test] fn deny_response_body_is_agent_readable_and_redacted() { // Agent-readable next_steps is gated on the proposals feature flag. 
- let _proposals = crate::test_helpers::ProposalsFlagGuard::set_blocking(true); + let _proposals = + openshell_core::proposals::test_helpers::ProposalsFlagGuard::set_blocking(true); let req = L7Request { action: "PUT".to_string(), target: "/repos/NVIDIA/OpenShell/contents/README.md?access_token=secret-token" @@ -2350,7 +2351,8 @@ mod tests { #[test] fn deny_response_body_omits_agent_guidance_when_policy_advisor_is_off() { - let _proposals = crate::test_helpers::ProposalsFlagGuard::set_blocking(false); + let _proposals = + openshell_core::proposals::test_helpers::ProposalsFlagGuard::set_blocking(false); let req = L7Request { action: "GET".to_string(), target: "/gists".to_string(), @@ -2382,7 +2384,8 @@ mod tests { #[tokio::test] async fn send_deny_response_writes_structured_json_403() { // Agent-readable next_steps is gated on the proposals feature flag. - let _proposals = crate::test_helpers::ProposalsFlagGuard::set(true).await; + let _proposals = + openshell_core::proposals::test_helpers::ProposalsFlagGuard::set(true).await; let (mut client, mut server) = tokio::io::duplex(4096); let send = tokio::spawn(async move { let req = L7Request { diff --git a/crates/openshell-sandbox/src/l7/tls.rs b/crates/openshell-supervisor-networking/src/l7/tls.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/tls.rs rename to crates/openshell-supervisor-networking/src/l7/tls.rs diff --git a/crates/openshell-sandbox/src/l7/websocket.rs b/crates/openshell-supervisor-networking/src/l7/websocket.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/websocket.rs rename to crates/openshell-supervisor-networking/src/l7/websocket.rs index 876d86d00..c965aacf5 100644 --- a/crates/openshell-sandbox/src/l7/websocket.rs +++ b/crates/openshell-supervisor-networking/src/l7/websocket.rs @@ -954,7 +954,7 @@ fn emit_rewrite_event(host: &str, port: u16, policy_name: &str, replacements: us } else { policy_name }; - let event = 
NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -1001,7 +1001,7 @@ fn emit_websocket_l7_event( ), }; let summary = graphql.map(graphql_log_summary).unwrap_or_default(); - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -1082,7 +1082,7 @@ fn emit_protocol_failure(host: &str, port: u16, policy_name: &str, failure_class } else { policy_name }; - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index 2c9d649a8..fd81e83f3 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -9,4 +9,8 @@ pub mod denial_aggregator; pub mod identity; +pub mod l7; pub mod mechanistic_mapper; +pub mod opa; +pub mod policy_local; +pub mod proxy; diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-supervisor-networking/src/opa.rs similarity index 99% rename from crates/openshell-sandbox/src/opa.rs rename to crates/openshell-supervisor-networking/src/opa.rs index ecea499e0..4dd0350ff 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-supervisor-networking/src/opa.rs @@ -202,7 +202,7 @@ impl OpaEngine { let (errors, warnings) = crate::l7::validate_l7_policies(&data); for w in &warnings { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) 
.severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "validated") @@ -728,7 +728,7 @@ fn preprocess_yaml_data(yaml_str: &str) -> Result { let (errors, warnings) = crate::l7::validate_l7_policies(&data); for w in &warnings { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "validated") diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-supervisor-networking/src/policy_local.rs similarity index 99% rename from crates/openshell-sandbox/src/policy_local.rs rename to crates/openshell-supervisor-networking/src/policy_local.rs index 9d570dfd0..b0ff271f6 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-supervisor-networking/src/policy_local.rs @@ -19,6 +19,7 @@ use tokio::sync::RwLock; pub const POLICY_LOCAL_HOST: &str = "policy.local"; /// Filesystem path of the static agent guidance bundle inside the sandbox. +/// /// Single source of truth: the skill installer writes here, the L7 deny body /// references this path in `next_steps`, and the skill's own documentation /// renders the same path. Changing the location is a one-line update here. @@ -28,7 +29,9 @@ pub const SKILL_PATH: &str = "/etc/openshell/skills/policy_advisor.md"; /// instructions than structured next-step JSON alone. pub const AGENT_GUIDANCE: &str = "OpenShell blocked this request with sandbox policy. If the user's task still needs this network action, do not stop here: read /etc/openshell/skills/policy_advisor.md, submit the narrowest policy proposal to http://policy.local/v1/proposals, wait for approval and `policy_reloaded: true`, then retry the original request."; -/// Routes served by the in-sandbox policy advisor API. 
Held in one place so +/// Routes served by the in-sandbox policy advisor API. +/// +/// Held in one place so /// the L7 deny `next_steps` array, the route dispatcher, the skill content, /// and tests all stay in sync — change the wire path here and every caller /// follows. See `agent_next_steps()` for the consumer that surfaces these @@ -146,7 +149,7 @@ async fn route_request( // when the flag is off — including the diagnostic `current_policy` and // `denials` routes. The skill is also not installed in that mode, so a // disabled sandbox has no entry point into this API at all. - if !crate::agent_proposals_enabled() { + if !openshell_core::proposals::agent_proposals_enabled() { return ( 404, serde_json::json!({ @@ -209,7 +212,9 @@ fn not_found_payload(path: &str) -> (u16, serde_json::Value) { } /// Build the `next_steps` array embedded in the L7 deny body so the agent has -/// machine-readable pointers to this API. Centralizes the shape here to keep +/// machine-readable pointers to this API. +/// +/// Centralizes the shape here to keep /// the deny body and the actual route table from drifting — adding or /// renaming a route only requires touching the route constants above. /// @@ -218,7 +223,7 @@ fn not_found_payload(path: &str) -> (u16, serde_json::Value) { /// caller still emits the field (with `[]`) so the wire shape is stable. #[must_use] pub fn agent_next_steps() -> serde_json::Value { - if !crate::agent_proposals_enabled() { + if !openshell_core::proposals::agent_proposals_enabled() { return serde_json::json!([]); } let host = POLICY_LOCAL_HOST; @@ -249,7 +254,7 @@ pub fn agent_next_steps() -> serde_json::Value { /// Build the optional natural-language guidance embedded in L7 deny bodies. 
#[must_use] pub fn agent_guidance() -> Option<&'static str> { - crate::agent_proposals_enabled().then_some(AGENT_GUIDANCE) + openshell_core::proposals::agent_proposals_enabled().then_some(AGENT_GUIDANCE) } async fn current_policy_response(ctx: &PolicyLocalContext) -> (u16, serde_json::Value) { @@ -558,7 +563,7 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j /// to see in the audit trace to correlate against the inbox card. fn emit_policy_propose_event(chunk_id: &str, summary: &str) { ocsf_emit!( - ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(SeverityId::Informational) .status(StatusId::Success) .state(StateId::Other, "PROPOSED") @@ -578,7 +583,7 @@ fn emit_policy_decision_event(chunk: &PolicyChunk) { let summary = summarize_chunk_for_audit(chunk); match chunk.status.as_str() { "approved" => ocsf_emit!( - ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(SeverityId::Informational) .status(StatusId::Success) .state(StateId::Enabled, "APPROVED") @@ -600,7 +605,7 @@ fn emit_policy_decision_event(chunk: &PolicyChunk) { format!("\"{sanitized}\"") }; ocsf_emit!( - ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(SeverityId::Low) .status(StatusId::Success) .state(StateId::Disabled, "REJECTED") @@ -1568,7 +1573,7 @@ mod tests { assert!(surfaced.ends_with("...[truncated]")); } - use crate::test_helpers::ProposalsFlagGuard; + use openshell_core::proposals::test_helpers::ProposalsFlagGuard; #[test] fn agent_next_steps_returns_empty_when_flag_off() { diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-supervisor-networking/src/proxy.rs similarity index 98% rename from crates/openshell-sandbox/src/proxy.rs rename to crates/openshell-supervisor-networking/src/proxy.rs index 15a549874..499386879 100644 --- 
a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-supervisor-networking/src/proxy.rs @@ -3,6 +3,7 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. +use crate::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; @@ -16,7 +17,6 @@ use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, NetworkActivityBuilder, Process, SeverityId, StatusId, Url as OcsfUrl, ocsf_emit, }; -use openshell_supervisor_networking::identity::BinaryIdentityCache; use std::net::{IpAddr, SocketAddr}; use std::path::PathBuf; use std::sync::Arc; @@ -175,7 +175,7 @@ impl ProxyHandle { /// The proxy uses OPA for network decisions with process-identity binding /// via `/proc/net/tcp`. All connections are evaluated through OPA policy. #[allow(clippy::too_many_arguments)] - pub(crate) async fn start_with_bind_addr( + pub async fn start_with_bind_addr( policy: &ProxyPolicy, bind_addr: Option, opa_engine: Arc, @@ -204,7 +204,7 @@ impl ProxyHandle { let listener = TcpListener::bind(http_addr).await.into_diagnostic()?; let local_addr = listener.local_addr().into_diagnostic()?; { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Listen) .severity(SeverityId::Informational) .status(StatusId::Success) @@ -256,7 +256,7 @@ impl ProxyHandle { ) .await { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -267,7 +267,7 @@ impl ProxyHandle { }); } Err(err) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) 
.activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -435,7 +435,7 @@ async fn handle_tcp_connection( ) .await?; if let InferenceOutcome::Denied { reason } = outcome { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -513,7 +513,7 @@ async fn handle_tcp_connection( // Allowed connections are logged after the L7 config check (below) // so we can distinguish CONNECT (L4-only) from CONNECT_L7 (L7 follows). if matches!(decision.action, NetworkAction::Deny { .. }) { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -587,7 +587,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -642,7 +642,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -689,7 +689,7 @@ async fn handle_tcp_connection( } Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -742,7 +742,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = 
NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -794,7 +794,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -860,7 +860,7 @@ async fn handle_tcp_connection( "CONNECT" }; { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -1002,7 +1002,7 @@ async fn handle_tcp_connection( "TLS connection closed" ); } else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1014,7 +1014,7 @@ async fn handle_tcp_connection( } } else { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1067,7 +1067,7 @@ async fn handle_tcp_connection( if is_benign_relay_error(&e) { debug!(host = %host_lc, port = port, error = %e, "L7 connection closed"); } else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1100,7 +1100,7 @@ async fn handle_tcp_connection( if is_benign_relay_error(&e) { debug!(host = %host_lc, port = port, error = %e, "HTTP relay closed"); } else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = 
NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1550,7 +1550,7 @@ async fn process_inference_keepalive { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Rejected) @@ -1589,7 +1589,7 @@ async fn route_inference_request( detect_inference_pattern(&request.method, &normalized_path, &ctx.patterns) { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Allowed) .disposition(DispositionId::Detected) @@ -1677,7 +1677,7 @@ async fn route_inference_request( } Ok(Ok(None)) => break, Ok(Err(e)) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -1693,7 +1693,7 @@ async fn route_inference_request( break; } Err(_) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -1717,7 +1717,7 @@ async fn route_inference_request( } Err(e) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1743,7 +1743,7 @@ async fn route_inference_request( } else { // Not an inference request — deny { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) 
@@ -1836,7 +1836,7 @@ struct L7RouteSnapshot { } fn emit_l7_tunnel_close_after_policy_change(host: &str, port: u16, error: miette::Report) { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -1888,7 +1888,7 @@ fn query_l7_route_snapshot( generation, }), Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2429,7 +2429,7 @@ fn parse_allowed_ips(raw: &[String]) -> std::result::Result, S } if n.prefix_len() < MIN_SAFE_PREFIX_LEN { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Medium) .message(format!( @@ -2481,7 +2481,7 @@ fn query_allowed_ips( match engine.query_allowed_ips(&input) { Ok(ips) => ips, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2523,7 +2523,7 @@ fn query_exact_declared_endpoint_host( match engine.query_exact_declared_endpoint_host(&input) { Ok(is_exact_declared) => is_exact_declared, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2857,7 +2857,7 @@ async fn handle_forward_proxy( let (scheme, host, port, mut path) = match parse_proxy_uri(target_uri) { Ok(parsed) => parsed, Err(e) => { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) 
.activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2905,7 +2905,7 @@ async fn handle_forward_proxy( // 2. Reject HTTPS — must use CONNECT for TLS if scheme == "https" { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Rejected) @@ -2981,7 +2981,7 @@ async fn handle_forward_proxy( NetworkAction::Allow { matched_policy } => matched_policy.clone(), NetworkAction::Deny { reason } => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3153,7 +3153,7 @@ async fn handle_forward_proxy( params } Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -3221,7 +3221,7 @@ async fn handle_forward_proxy( { Ok(info) => info, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -3264,7 +3264,7 @@ async fn handle_forward_proxy( || { crate::l7::relay::evaluate_l7_request(&tunnel_engine, &l7_ctx, &request_info) .unwrap_or_else(|e| { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -3309,7 +3309,7 @@ async fn handle_forward_proxy( } else { "FORWARD_L7" }; - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) 
.action(action_id) .disposition(disposition_id) @@ -3388,7 +3388,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3444,7 +3444,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3495,7 +3495,7 @@ async fn handle_forward_proxy( } Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3551,7 +3551,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3605,7 +3605,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3672,7 +3672,7 @@ async fn handle_forward_proxy( let mut upstream = match TcpStream::connect(addrs.as_slice()).await { Ok(s) => s, Err(e) => { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -3707,7 +3707,7 @@ async fn handle_forward_proxy( // Log success { 
- let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -6651,7 +6651,7 @@ network_policies: #[cfg(target_os = "linux")] #[test] fn resolve_process_identity_surfaces_binary_integrity_violation_on_hot_swap() { - use openshell_supervisor_networking::identity::BinaryIdentityCache; + use crate::identity::BinaryIdentityCache; use std::io::Read; use std::net::TcpListener; use std::os::unix::fs::PermissionsExt; @@ -6783,7 +6783,7 @@ network_policies: // SELinux-enforcing hosts. Fix by building a test-sleep-helper binary in // the same crate so it inherits the user_home_t label. fn resolve_process_identity_denies_fork_exec_shared_socket_ambiguity() { - use openshell_supervisor_networking::identity::BinaryIdentityCache; + use crate::identity::BinaryIdentityCache; use std::ffi::CString; use std::net::{TcpListener, TcpStream}; use std::os::fd::AsRawFd; diff --git a/crates/openshell-sandbox/testdata/sandbox-policy.yaml b/crates/openshell-supervisor-networking/testdata/sandbox-policy.yaml similarity index 100% rename from crates/openshell-sandbox/testdata/sandbox-policy.yaml rename to crates/openshell-supervisor-networking/testdata/sandbox-policy.yaml diff --git a/crates/openshell-sandbox/tests/system_inference.rs b/crates/openshell-supervisor-networking/tests/system_inference.rs similarity index 94% rename from crates/openshell-sandbox/tests/system_inference.rs rename to crates/openshell-supervisor-networking/tests/system_inference.rs index 20c39f3b6..324240c0a 100644 --- a/crates/openshell-sandbox/tests/system_inference.rs +++ b/crates/openshell-supervisor-networking/tests/system_inference.rs @@ -9,7 +9,7 @@ use openshell_router::Router; use openshell_router::config::{AuthHeader, ResolvedRoute}; -use openshell_sandbox::proxy::InferenceContext; +use openshell_supervisor_networking::proxy::InferenceContext; fn 
make_system_route() -> ResolvedRoute { ResolvedRoute { @@ -42,7 +42,7 @@ fn make_user_route() -> ResolvedRoute { #[tokio::test] async fn system_inference_routes_to_mock_backend() { let router = Router::new().unwrap(); - let patterns = openshell_sandbox::l7::inference::default_patterns(); + let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); let ctx = InferenceContext::new( patterns, @@ -86,7 +86,7 @@ async fn system_inference_routes_to_mock_backend() { #[tokio::test] async fn system_inference_uses_system_routes_not_user_routes() { let router = Router::new().unwrap(); - let patterns = openshell_sandbox::l7::inference::default_patterns(); + let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); // Only user routes configured — no system routes let ctx = InferenceContext::new(patterns, router, vec![make_user_route()], vec![]); @@ -118,7 +118,7 @@ async fn system_inference_uses_system_routes_not_user_routes() { #[tokio::test] async fn system_inference_with_anthropic_protocol() { let router = Router::new().unwrap(); - let patterns = openshell_sandbox::l7::inference::default_patterns(); + let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); let system_route = ResolvedRoute { name: "sandbox-system".to_string(), diff --git a/crates/openshell-sandbox/tests/websocket_upgrade.rs b/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs similarity index 98% rename from crates/openshell-sandbox/tests/websocket_upgrade.rs rename to crates/openshell-supervisor-networking/tests/websocket_upgrade.rs index b35076a9a..3339a42a7 100644 --- a/crates/openshell-sandbox/tests/websocket_upgrade.rs +++ b/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs @@ -26,8 +26,8 @@ use futures::SinkExt; use futures::stream::StreamExt; -use openshell_sandbox::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; -use openshell_sandbox::l7::rest::RestProvider; +use 
openshell_supervisor_networking::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; +use openshell_supervisor_networking::l7::rest::RestProvider; use std::collections::HashMap; use std::net::SocketAddr; use tokio::io::{AsyncReadExt, AsyncWriteExt}; From bec10f3da5b8bd7934b862000b4263d308b3b284 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 22:49:37 +0300 Subject: [PATCH 29/49] refactor(sandbox): hoist policy poll loop and denial aggregator into orchestrator Move the symlink-resolver, policy poll loop, and denial-aggregator flush spawns out of run_process and into run_sandbox so run_process no longer needs OpaEngine, retained_proto, the local policy context, the sandbox name, the gateway endpoint for telemetry, the OCSF flag, or the denial receiver. These long-running orchestrator-owned tasks now live alongside the other sandbox-startup wiring, matching the design log decision in architecture/plans/sandbox-split-design-choices.md (Q5). Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 281 +++++++++--------- .../tests/websocket_upgrade.rs | 4 +- 2 files changed, 140 insertions(+), 145 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 283cf1b89..d496ee2e8 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -420,22 +420,16 @@ async fn run_process( timeout_secs: u64, interactive: bool, sandbox_id: Option<&str>, - sandbox_name_for_agg: Option<&str>, openshell_endpoint: Option<&str>, ssh_socket_path: Option, policy: &SandboxPolicy, - opa_engine: Option<&Arc>, - retained_proto: Option<&openshell_core::proto::SandboxPolicy>, entrypoint_pid: Arc, provider_credentials: ProviderCredentialState, provider_env: std::collections::HashMap, - policy_local_ctx: Arc, - ocsf_enabled: Arc, ssh_proxy_url: Option, ssh_netns_fd: Option, ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, #[cfg(target_os = "linux")] 
netns: Option<&NetworkNamespace>, - denial_rx: Option>, ) -> Result { // Zombie reaper — openshell-sandbox may run as PID 1 in containers and // must reap orphaned grandchildren (e.g. background daemons started by @@ -622,138 +616,6 @@ async fn run_process( .build() ); - // Spawn a task to resolve policy binary symlinks after the container - // filesystem becomes accessible via /proc//root/. This expands - // symlinks like /usr/bin/python3 → /usr/bin/python3.11 in the OPA - // policy data so that either path matches at evaluation time. - // - // We cannot do this synchronously here because the child process has - // just been spawned and its mount namespace / procfs entries may not - // be fully populated yet. Instead, we probe with retries until - // /proc//root/ is accessible or we exhaust attempts. - if let (Some(engine), Some(proto)) = (opa_engine, retained_proto) { - let resolve_engine = engine.clone(); - let resolve_proto = proto.clone(); - let resolve_pid = entrypoint_pid.clone(); - tokio::spawn(async move { - let pid = resolve_pid.load(Ordering::Acquire); - let probe_path = format!("/proc/{pid}/root/"); - // Retry up to 10 times with 500ms intervals (5s total). - // The child's mount namespace is typically ready within a - // few hundred ms of spawn. 
- for attempt in 1..=10 { - tokio::time::sleep(Duration::from_millis(500)).await; - if std::fs::metadata(&probe_path).is_ok() { - info!( - pid = pid, - attempt = attempt, - "Container filesystem accessible, resolving policy binary symlinks" - ); - match resolve_engine.reload_from_proto_with_pid(&resolve_proto, pid) { - Ok(()) => { - info!( - pid = pid, - "Policy binary symlink resolution complete \ - (check logs above for per-binary results)" - ); - } - Err(e) => { - warn!( - "Failed to rebuild OPA engine with symlink resolution \ - (non-fatal, falling back to literal path matching): {e}" - ); - } - } - return; - } - debug!( - pid = pid, - attempt = attempt, - probe_path = %probe_path, - "Container filesystem not yet accessible, retrying symlink resolution" - ); - } - warn!( - "Container filesystem /proc/{pid}/root/ not accessible after 10 attempts (5s); \ - binary symlink resolution skipped. Policy binary paths will be matched literally. \ - If binaries are symlinks, use canonical paths in your policy \ - (run 'readlink -f ' inside the sandbox)" - ); - }); - } - - // Spawn background policy poll task (gRPC mode only). 
- if let (Some(id), Some(endpoint), Some(engine)) = (sandbox_id, openshell_endpoint, opa_engine) { - let poll_id = id.to_string(); - let poll_endpoint = endpoint.to_string(); - let poll_engine = engine.clone(); - let poll_ocsf_enabled = ocsf_enabled.clone(); - let poll_pid = entrypoint_pid.clone(); - let poll_provider_credentials = provider_credentials.clone(); - let poll_policy_local = policy_local_ctx.clone(); - let poll_interval_secs: u64 = std::env::var("OPENSHELL_POLICY_POLL_INTERVAL_SECS") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(10); - let poll_ctx = PolicyPollLoopContext { - endpoint: poll_endpoint, - sandbox_id: poll_id, - opa_engine: poll_engine, - entrypoint_pid: poll_pid, - interval_secs: poll_interval_secs, - ocsf_enabled: poll_ocsf_enabled, - provider_credentials: poll_provider_credentials, - policy_local_ctx: Some(poll_policy_local), - }; - - tokio::spawn(async move { - if let Err(e) = run_policy_poll_loop(poll_ctx).await { - ocsf_emit!( - AppLifecycleBuilder::new(ocsf_ctx()) - .activity(ActivityId::Fail) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .message(format!("Policy poll loop exited with error: {e}")) - .build() - ); - } - }); - - // Spawn denial aggregator (gRPC mode only, when proxy is active). - if let Some(rx) = denial_rx { - // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID. 
- let agg_name = sandbox_name_for_agg.map_or_else(|| id.to_string(), str::to_string); - let agg_endpoint = endpoint.to_string(); - let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(10); - - let aggregator = - openshell_supervisor_networking::denial_aggregator::DenialAggregator::new( - rx, - flush_interval_secs, - ); - - tokio::spawn(async move { - aggregator - .run(|summaries| { - let endpoint = agg_endpoint.clone(); - let sandbox_name = agg_name.clone(); - async move { - if let Err(e) = - flush_proposals_to_gateway(&endpoint, &sandbox_name, summaries) - .await - { - warn!(error = %e, "Failed to flush denial summaries to gateway"); - } - } - }) - .await; - }); - } - } - // Wait for process with optional timeout let result = if timeout_secs > 0 { if let Ok(result) = timeout(Duration::from_secs(timeout_secs), handle.wait()).await { @@ -1011,6 +873,143 @@ pub async fn run_sandbox( // listener and workload process are exposed. apply_supervisor_startup_hardening()?; + // Spawn a task to resolve policy binary symlinks after the container + // filesystem becomes accessible via /proc//root/. This expands + // symlinks like /usr/bin/python3 → /usr/bin/python3.11 in the OPA + // policy data so that either path matches at evaluation time. + // + // The task probes /proc//root/ with retries until accessible. + // NOTE(review): `entrypoint_pid` is loaded once, before the retry loop, so + // spawning here (before `run_process` stores the PID) may probe a stale PID — confirm. + if let (Some(engine), Some(proto)) = (opa_engine.as_ref(), retained_proto.as_ref()) { + let resolve_engine = engine.clone(); + let resolve_proto = proto.clone(); + let resolve_pid = entrypoint_pid.clone(); + tokio::spawn(async move { + let pid = resolve_pid.load(Ordering::Acquire); + let probe_path = format!("/proc/{pid}/root/"); + // Retry up to 10 times with 500ms intervals (5s total). 
+ // The child's mount namespace is typically ready within a + // few hundred ms of spawn. + for attempt in 1..=10 { + tokio::time::sleep(Duration::from_millis(500)).await; + if std::fs::metadata(&probe_path).is_ok() { + info!( + pid = pid, + attempt = attempt, + "Container filesystem accessible, resolving policy binary symlinks" + ); + match resolve_engine.reload_from_proto_with_pid(&resolve_proto, pid) { + Ok(()) => { + info!( + pid = pid, + "Policy binary symlink resolution complete \ + (check logs above for per-binary results)" + ); + } + Err(e) => { + warn!( + "Failed to rebuild OPA engine with symlink resolution \ + (non-fatal, falling back to literal path matching): {e}" + ); + } + } + return; + } + debug!( + pid = pid, + attempt = attempt, + probe_path = %probe_path, + "Container filesystem not yet accessible, retrying symlink resolution" + ); + } + warn!( + "Container filesystem /proc/{pid}/root/ not accessible after 10 attempts (5s); \ + binary symlink resolution skipped. Policy binary paths will be matched literally. \ + If binaries are symlinks, use canonical paths in your policy \ + (run 'readlink -f ' inside the sandbox)" + ); + }); + } + + // Spawn background policy poll task (gRPC mode only). 
+ if let (Some(id), Some(endpoint), Some(engine)) = ( + sandbox_id.as_deref(), + openshell_endpoint.as_deref(), + opa_engine.as_ref(), + ) { + let poll_id = id.to_string(); + let poll_endpoint = endpoint.to_string(); + let poll_engine = engine.clone(); + let poll_ocsf_enabled = ocsf_enabled.clone(); + let poll_pid = entrypoint_pid.clone(); + let poll_provider_credentials = provider_credentials.clone(); + let poll_policy_local = policy_local_ctx.clone(); + let poll_interval_secs: u64 = std::env::var("OPENSHELL_POLICY_POLL_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + let poll_ctx = PolicyPollLoopContext { + endpoint: poll_endpoint, + sandbox_id: poll_id, + opa_engine: poll_engine, + entrypoint_pid: poll_pid, + interval_secs: poll_interval_secs, + ocsf_enabled: poll_ocsf_enabled, + provider_credentials: poll_provider_credentials, + policy_local_ctx: Some(poll_policy_local), + }; + + tokio::spawn(async move { + if let Err(e) = run_policy_poll_loop(poll_ctx).await { + ocsf_emit!( + AppLifecycleBuilder::new(ocsf_ctx()) + .activity(ActivityId::Fail) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .message(format!("Policy poll loop exited with error: {e}")) + .build() + ); + } + }); + + // Spawn denial aggregator (gRPC mode only, when proxy is active). + if let Some(rx) = networking.denial_rx.take() { + // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID. 
+ let agg_name = sandbox_name_for_agg + .as_deref() + .map_or_else(|| id.to_string(), str::to_string); + let agg_endpoint = endpoint.to_string(); + let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + + let aggregator = + openshell_supervisor_networking::denial_aggregator::DenialAggregator::new( + rx, + flush_interval_secs, + ); + + tokio::spawn(async move { + aggregator + .run(|summaries| { + let endpoint = agg_endpoint.clone(); + let sandbox_name = agg_name.clone(); + async move { + if let Err(e) = + flush_proposals_to_gateway(&endpoint, &sandbox_name, summaries) + .await + { + warn!(error = %e, "Failed to flush denial summaries to gateway"); + } + } + }) + .await; + }); + } + } + let exit_code = run_process( program, args, @@ -1018,23 +1017,17 @@ pub async fn run_sandbox( timeout_secs, interactive, sandbox_id.as_deref(), - sandbox_name_for_agg.as_deref(), openshell_endpoint.as_deref(), ssh_socket_path, &policy, - opa_engine.as_ref(), - retained_proto.as_ref(), entrypoint_pid, provider_credentials, provider_env, - policy_local_ctx, - ocsf_enabled, networking.ssh_proxy_url.take(), networking.ssh_netns_fd, networking.ca_file_paths.clone(), #[cfg(target_os = "linux")] netns.as_ref(), - networking.denial_rx.take(), ) .await?; diff --git a/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs b/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs index 3339a42a7..dfeefc0f6 100644 --- a/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs +++ b/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs @@ -26,7 +26,9 @@ use futures::SinkExt; use futures::stream::StreamExt; -use openshell_supervisor_networking::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; +use openshell_supervisor_networking::l7::provider::{ + BodyLength, L7Provider, L7Request, RelayOutcome, +}; use 
openshell_supervisor_networking::l7::rest::RestProvider; use std::collections::HashMap; use std::net::SocketAddr; From c8ad6c97311a2b7344771ea56f3c13879f41f0e5 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Sun, 31 May 2026 23:13:45 +0300 Subject: [PATCH 30/49] refactor(supervisor-process): move run_process from openshell-sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lift the workload supervision entry point (zombie reaper, SSH server spawn, supervisor session, entrypoint child spawn, exit-with-timeout) into its own module in openshell-supervisor-process. The orchestrator in openshell-sandbox now calls openshell_supervisor_process::run::run_process directly. With this move run_process names only types from openshell-core, openshell-ocsf, openshell-supervisor-process itself, std, and tokio — no openshell-supervisor-networking dependency. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 257 +--------------- .../openshell-supervisor-process/src/lib.rs | 1 + .../openshell-supervisor-process/src/run.rs | 285 ++++++++++++++++++ 3 files changed, 288 insertions(+), 255 deletions(-) create mode 100644 crates/openshell-supervisor-process/src/run.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index d496ee2e8..5549598e5 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -13,13 +13,11 @@ use std::sync::Arc; use std::sync::Mutex; use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; -use tokio::time::timeout; use tracing::{debug, info, trace, warn}; use openshell_ocsf::{ ActionId, ActivityId, AppLifecycleBuilder, ConfigStateChangeBuilder, DetectionFindingBuilder, - DispositionId, FindingInfo, LaunchTypeId, Process as OcsfProcess, ProcessActivityBuilder, - SandboxContext, SeverityId, StateId, StatusId, ocsf_emit, + DispositionId, FindingInfo, SandboxContext, SeverityId, StateId, StatusId, 
ocsf_emit, }; // --------------------------------------------------------------------------- @@ -404,257 +402,6 @@ async fn run_networking( }) } -/// Run the sandbox workload: spawn the zombie reaper, the SSH server, the -/// supervisor session, the entrypoint process, the OPA symlink probe, and the -/// policy poll loop / denial aggregator; then wait for the entrypoint with an -/// optional timeout and emit the exit OCSF event. -/// -/// Networking outputs (`ssh_proxy_url`, `ssh_netns_fd`, `ca_file_paths`, -/// `netns`, `denial_rx`) are passed in individually so that this fn does not -/// depend on the `Networking` struct directly. -#[allow(clippy::too_many_arguments, clippy::similar_names)] -async fn run_process( - program: &str, - args: &[String], - workdir: Option<&str>, - timeout_secs: u64, - interactive: bool, - sandbox_id: Option<&str>, - openshell_endpoint: Option<&str>, - ssh_socket_path: Option, - policy: &SandboxPolicy, - entrypoint_pid: Arc, - provider_credentials: ProviderCredentialState, - provider_env: std::collections::HashMap, - ssh_proxy_url: Option, - ssh_netns_fd: Option, - ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, - #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, -) -> Result { - // Zombie reaper — openshell-sandbox may run as PID 1 in containers and - // must reap orphaned grandchildren (e.g. background daemons started by - // coding agents) to prevent zombie accumulation. - // - // Use waitid(..., WNOWAIT) so we can inspect exited children before - // actually reaping them. This avoids racing explicit `child.wait()` calls - // for managed children (entrypoint and SSH session processes). 
- #[cfg(target_os = "linux")] - tokio::spawn(async { - use nix::sys::wait::{Id, WaitPidFlag, WaitStatus, waitid, waitpid}; - use tokio::signal::unix::{SignalKind, signal}; - use tokio::time::MissedTickBehavior; - - let mut sigchld = match signal(SignalKind::child()) { - Ok(s) => s, - Err(e) => { - tracing::warn!(error = %e, "Failed to register SIGCHLD handler for zombie reaping"); - return; - } - }; - let mut retry = tokio::time::interval(Duration::from_secs(5)); - retry.set_missed_tick_behavior(MissedTickBehavior::Skip); - - loop { - tokio::select! { - _ = sigchld.recv() => {} - _ = retry.tick() => {} - } - - loop { - let status = match waitid( - Id::All, - WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT, - ) { - Ok(WaitStatus::StillAlive) | Err(nix::errno::Errno::ECHILD) => break, - Ok(status) => status, - Err(nix::errno::Errno::EINTR) => continue, - Err(e) => { - tracing::debug!(error = %e, "waitid error during zombie reaping"); - break; - } - }; - - let Some(pid) = status.pid() else { - break; - }; - - if managed_children::is_managed(pid.as_raw()) { - // Let the explicit waiter own this child status. 
- break; - } - - match waitpid(pid, Some(WaitPidFlag::WNOHANG)) { - Ok(WaitStatus::StillAlive) - | Err(nix::errno::Errno::ECHILD | nix::errno::Errno::EINTR) => {} - Ok(reaped) => { - tracing::debug!(?reaped, "Reaped orphaned child process"); - } - Err(e) => { - tracing::debug!(error = %e, "waitpid error during orphan reap"); - break; - } - } - } - } - }); - - let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); - if let Some(listen_path) = ssh_socket_path.clone() { - let policy_clone = policy.clone(); - let workdir_clone = workdir.map(str::to_string); - let proxy_url = ssh_proxy_url; - let netns_fd = ssh_netns_fd; - let ca_paths = ca_file_paths.clone(); - let provider_credentials_clone = provider_credentials.clone(); - - let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); - - tokio::spawn(async move { - if let Err(err) = openshell_supervisor_process::ssh::run_ssh_server( - listen_path, - ssh_ready_tx, - policy_clone, - workdir_clone, - netns_fd, - proxy_url, - ca_paths, - provider_credentials_clone, - ) - .await - { - ocsf_emit!( - AppLifecycleBuilder::new(ocsf_ctx()) - .activity(ActivityId::Fail) - .severity(SeverityId::Critical) - .status(StatusId::Failure) - .message(format!("SSH server failed: {err}")) - .build() - ); - } - }); - - // Wait for the SSH server to bind its socket before spawning the - // entrypoint process. This prevents exec requests from racing against - // SSH server startup when Kubernetes marks the pod Ready. 
- match timeout(Duration::from_secs(10), ssh_ready_rx).await { - Ok(Ok(Ok(()))) => { - ocsf_emit!( - AppLifecycleBuilder::new(ocsf_ctx()) - .activity(ActivityId::Open) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .message("SSH server is ready to accept connections") - .build() - ); - } - Ok(Ok(Err(err))) => { - return Err(err.context("SSH server failed during startup")); - } - Ok(Err(_)) => { - return Err(miette::miette!( - "SSH server task panicked before signaling ready" - )); - } - Err(_) => { - return Err(miette::miette!( - "SSH server did not start within 10 seconds" - )); - } - } - } - - // Spawn the persistent supervisor session if we have a gateway endpoint - // and sandbox identity. The session provides relay channels for SSH - // connect and ExecSandbox through the gateway. - if let (Some(endpoint), Some(id), Some(socket)) = - (openshell_endpoint, sandbox_id, ssh_socket_path.as_ref()) - { - openshell_supervisor_process::supervisor_session::spawn( - endpoint.to_string(), - id.to_string(), - socket.clone(), - ssh_netns_fd, - ); - info!("supervisor session task spawned"); - } - - #[cfg(target_os = "linux")] - let mut handle = ProcessHandle::spawn( - program, - args, - workdir, - interactive, - policy, - netns, - ca_file_paths.as_ref(), - &provider_env, - )?; - - #[cfg(not(target_os = "linux"))] - let mut handle = ProcessHandle::spawn( - program, - args, - workdir, - interactive, - policy, - ca_file_paths.as_ref(), - &provider_env, - )?; - - // Store the entrypoint PID so the proxy can resolve TCP peer identity - entrypoint_pid.store(handle.pid(), Ordering::Release); - ocsf_emit!( - ProcessActivityBuilder::new(ocsf_ctx()) - .activity(ActivityId::Open) - .action(ActionId::Allowed) - .disposition(DispositionId::Allowed) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .launch_type(LaunchTypeId::Spawn) - .process(OcsfProcess::new(program, i64::from(handle.pid()))) - .message(format!("Process started: pid={}", 
handle.pid())) - .build() - ); - - // Wait for process with optional timeout - let result = if timeout_secs > 0 { - if let Ok(result) = timeout(Duration::from_secs(timeout_secs), handle.wait()).await { - result - } else { - ocsf_emit!( - ProcessActivityBuilder::new(ocsf_ctx()) - .activity(ActivityId::Close) - .action(ActionId::Denied) - .disposition(DispositionId::Blocked) - .severity(SeverityId::Critical) - .status(StatusId::Failure) - .message("Process timed out, killing") - .build() - ); - handle.kill()?; - return Ok(124); // Standard timeout exit code - } - } else { - handle.wait().await - }; - - let status = result.into_diagnostic()?; - - ocsf_emit!( - ProcessActivityBuilder::new(ocsf_ctx()) - .activity(ActivityId::Close) - .action(ActionId::Allowed) - .disposition(DispositionId::Allowed) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .exit_code(status.code()) - .message(format!("Process exited with code {}", status.code())) - .build() - ); - - Ok(status.code()) -} - /// Run a command in the sandbox. /// /// # Errors @@ -1010,7 +757,7 @@ pub async fn run_sandbox( } } - let exit_code = run_process( + let exit_code = openshell_supervisor_process::run::run_process( program, args, workdir.as_deref(), diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index cc5ea88be..cb4d83d60 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -14,6 +14,7 @@ pub mod debug_rpc; pub mod log_push; pub mod managed_children; pub mod process; +pub mod run; pub mod sandbox; pub mod skills; pub mod ssh; diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs new file mode 100644 index 000000000..fc40859f8 --- /dev/null +++ b/crates/openshell-supervisor-process/src/run.rs @@ -0,0 +1,285 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! Workload supervision entry point. +//! +//! Spawns the SSH server, optional supervisor session, the entrypoint child +//! process, and waits for it to exit (with optional timeout). Long-running +//! background tasks that aren't strictly tied to the workload's lifetime +//! (policy poll loop, denial aggregator, symlink resolver) live in the +//! orchestrator, not here. + +use miette::{IntoDiagnostic, Result}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::time::Duration; +use tokio::time::timeout; +use tracing::info; + +use openshell_ocsf::{ + ActionId, ActivityId, AppLifecycleBuilder, DispositionId, LaunchTypeId, Process as OcsfProcess, + ProcessActivityBuilder, SeverityId, StatusId, ocsf_emit, +}; + +#[cfg(target_os = "linux")] +use openshell_core::netns::NetworkNamespace; +use openshell_core::policy::SandboxPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; + +#[cfg(target_os = "linux")] +use crate::managed_children; +use crate::process::ProcessHandle; + +fn ocsf_ctx() -> &'static openshell_ocsf::SandboxContext { + openshell_ocsf::ctx::ctx() +} + +/// Spawn the workload entrypoint, wire up SSH and supervisor session, and +/// wait for the entrypoint child to exit. +/// +/// # Errors +/// +/// Returns an error if SSH server startup fails, if the entrypoint child +/// fails to spawn, or if waiting for the child returns an OS error. 
+#[allow(clippy::too_many_arguments, clippy::implicit_hasher)] +pub async fn run_process( + program: &str, + args: &[String], + workdir: Option<&str>, + timeout_secs: u64, + interactive: bool, + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + ssh_socket_path: Option, + policy: &SandboxPolicy, + entrypoint_pid: Arc, + provider_credentials: ProviderCredentialState, + provider_env: std::collections::HashMap, + ssh_proxy_url: Option, + ssh_netns_fd: Option, + ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, + #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, +) -> Result { + // Zombie reaper — openshell-sandbox may run as PID 1 in containers and + // must reap orphaned grandchildren (e.g. background daemons started by + // coding agents) to prevent zombie accumulation. + // + // Use waitid(..., WNOWAIT) so we can inspect exited children before + // actually reaping them. This avoids racing explicit `child.wait()` calls + // for managed children (entrypoint and SSH session processes). + #[cfg(target_os = "linux")] + tokio::spawn(async { + use nix::sys::wait::{Id, WaitPidFlag, WaitStatus, waitid, waitpid}; + use tokio::signal::unix::{SignalKind, signal}; + use tokio::time::MissedTickBehavior; + + let mut sigchld = match signal(SignalKind::child()) { + Ok(s) => s, + Err(e) => { + tracing::warn!(error = %e, "Failed to register SIGCHLD handler for zombie reaping"); + return; + } + }; + let mut retry = tokio::time::interval(Duration::from_secs(5)); + retry.set_missed_tick_behavior(MissedTickBehavior::Skip); + + loop { + tokio::select! 
{ + _ = sigchld.recv() => {} + _ = retry.tick() => {} + } + + loop { + let status = match waitid( + Id::All, + WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT, + ) { + Ok(WaitStatus::StillAlive) | Err(nix::errno::Errno::ECHILD) => break, + Ok(status) => status, + Err(nix::errno::Errno::EINTR) => continue, + Err(e) => { + tracing::debug!(error = %e, "waitid error during zombie reaping"); + break; + } + }; + + let Some(pid) = status.pid() else { + break; + }; + + if managed_children::is_managed(pid.as_raw()) { + // Let the explicit waiter own this child status. + break; + } + + match waitpid(pid, Some(WaitPidFlag::WNOHANG)) { + Ok(WaitStatus::StillAlive) + | Err(nix::errno::Errno::ECHILD | nix::errno::Errno::EINTR) => {} + Ok(reaped) => { + tracing::debug!(?reaped, "Reaped orphaned child process"); + } + Err(e) => { + tracing::debug!(error = %e, "waitpid error during orphan reap"); + break; + } + } + } + } + }); + + let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); + if let Some(listen_path) = ssh_socket_path.clone() { + let policy_clone = policy.clone(); + let workdir_clone = workdir.map(str::to_string); + let proxy_url = ssh_proxy_url; + let netns_fd = ssh_netns_fd; + let ca_paths = ca_file_paths.clone(); + let provider_credentials_clone = provider_credentials.clone(); + + let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); + + tokio::spawn(async move { + if let Err(err) = crate::ssh::run_ssh_server( + listen_path, + ssh_ready_tx, + policy_clone, + workdir_clone, + netns_fd, + proxy_url, + ca_paths, + provider_credentials_clone, + ) + .await + { + ocsf_emit!( + AppLifecycleBuilder::new(ocsf_ctx()) + .activity(ActivityId::Fail) + .severity(SeverityId::Critical) + .status(StatusId::Failure) + .message(format!("SSH server failed: {err}")) + .build() + ); + } + }); + + // Wait for the SSH server to bind its socket before spawning the + // entrypoint process. 
This prevents exec requests from racing against + // SSH server startup when Kubernetes marks the pod Ready. + match timeout(Duration::from_secs(10), ssh_ready_rx).await { + Ok(Ok(Ok(()))) => { + ocsf_emit!( + AppLifecycleBuilder::new(ocsf_ctx()) + .activity(ActivityId::Open) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .message("SSH server is ready to accept connections") + .build() + ); + } + Ok(Ok(Err(err))) => { + return Err(err.context("SSH server failed during startup")); + } + Ok(Err(_)) => { + return Err(miette::miette!( + "SSH server task panicked before signaling ready" + )); + } + Err(_) => { + return Err(miette::miette!( + "SSH server did not start within 10 seconds" + )); + } + } + } + + // Spawn the persistent supervisor session if we have a gateway endpoint + // and sandbox identity. The session provides relay channels for SSH + // connect and ExecSandbox through the gateway. + if let (Some(endpoint), Some(id), Some(socket)) = + (openshell_endpoint, sandbox_id, ssh_socket_path.as_ref()) + { + crate::supervisor_session::spawn( + endpoint.to_string(), + id.to_string(), + socket.clone(), + ssh_netns_fd, + ); + info!("supervisor session task spawned"); + } + + #[cfg(target_os = "linux")] + let mut handle = ProcessHandle::spawn( + program, + args, + workdir, + interactive, + policy, + netns, + ca_file_paths.as_ref(), + &provider_env, + )?; + + #[cfg(not(target_os = "linux"))] + let mut handle = ProcessHandle::spawn( + program, + args, + workdir, + interactive, + policy, + ca_file_paths.as_ref(), + &provider_env, + )?; + + // Store the entrypoint PID so the proxy can resolve TCP peer identity + entrypoint_pid.store(handle.pid(), Ordering::Release); + ocsf_emit!( + ProcessActivityBuilder::new(ocsf_ctx()) + .activity(ActivityId::Open) + .action(ActionId::Allowed) + .disposition(DispositionId::Allowed) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .launch_type(LaunchTypeId::Spawn) + 
.process(OcsfProcess::new(program, i64::from(handle.pid()))) + .message(format!("Process started: pid={}", handle.pid())) + .build() + ); + + // Wait for process with optional timeout + let result = if timeout_secs > 0 { + if let Ok(result) = timeout(Duration::from_secs(timeout_secs), handle.wait()).await { + result + } else { + ocsf_emit!( + ProcessActivityBuilder::new(ocsf_ctx()) + .activity(ActivityId::Close) + .action(ActionId::Denied) + .disposition(DispositionId::Blocked) + .severity(SeverityId::Critical) + .status(StatusId::Failure) + .message("Process timed out, killing") + .build() + ); + handle.kill()?; + return Ok(124); // Standard timeout exit code + } + } else { + handle.wait().await + }; + + let status = result.into_diagnostic()?; + + ocsf_emit!( + ProcessActivityBuilder::new(ocsf_ctx()) + .activity(ActivityId::Close) + .action(ActionId::Allowed) + .disposition(DispositionId::Allowed) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .exit_code(status.code()) + .message(format!("Process exited with code {}", status.code())) + .build() + ); + + Ok(status.code()) +} From 49e9b27748e456ddc441a65a1bfff06306ac4f5e Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 08:16:26 +0300 Subject: [PATCH 31/49] refactor(supervisor-networking): move bypass_monitor from supervisor-process Bypass detection is network-policy enforcement: it parses nftables LOG entries from /dev/kmsg and emits OCSF NetworkActivity / DetectionFinding events plus DenialEvents into the same channel the proxy feeds. Its lifetime is tied to the network namespace, not to the workload child. Moving it to openshell-supervisor-networking puts it next to the proxy and the denial aggregator that consume its output, and unblocks moving run_networking out of openshell-sandbox without a leaf-to-leaf dep. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 2 +- .../src/bypass_monitor.rs | 0 crates/openshell-supervisor-networking/src/lib.rs | 1 + crates/openshell-supervisor-process/src/lib.rs | 1 - 4 files changed, 2 insertions(+), 2 deletions(-) rename crates/{openshell-supervisor-process => openshell-supervisor-networking}/src/bypass_monitor.rs (100%) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 5549598e5..c502f8d54 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -342,7 +342,7 @@ async fn run_networking( // tracing events for direct connection attempts that bypass the proxy. #[cfg(target_os = "linux")] let bypass_monitor_handle = netns.and_then(|ns| { - openshell_supervisor_process::bypass_monitor::spawn( + openshell_supervisor_networking::bypass_monitor::spawn( ns.name().to_string(), entrypoint_pid.clone(), bypass_denial_tx, diff --git a/crates/openshell-supervisor-process/src/bypass_monitor.rs b/crates/openshell-supervisor-networking/src/bypass_monitor.rs similarity index 100% rename from crates/openshell-supervisor-process/src/bypass_monitor.rs rename to crates/openshell-supervisor-networking/src/bypass_monitor.rs diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index fd81e83f3..fead9207c 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -7,6 +7,7 @@ //! inference routing, TLS interception, and denial aggregation. Populated by //! follow-up commits as modules migrate out of `openshell-sandbox`. 
+pub mod bypass_monitor; pub mod denial_aggregator; pub mod identity; pub mod l7; diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs index cb4d83d60..d7401f8a1 100644 --- a/crates/openshell-supervisor-process/src/lib.rs +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -8,7 +8,6 @@ //! and log push. Populated by follow-up commits as modules migrate out of //! `openshell-sandbox`. -pub mod bypass_monitor; pub mod child_env; pub mod debug_rpc; pub mod log_push; From 145a4ad7ce256226eb48fa689a9c8e6a5b60c6bf Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 09:24:47 +0300 Subject: [PATCH 32/49] refactor(supervisor-networking): move inference route helpers from openshell-sandbox Move build_inference_context, partition_routes, bundle_to_resolved_routes, spawn_route_refresh, the InferenceRouteSource enum, and the route refresh interval helpers into a new openshell-supervisor-networking::inference_routes module along with their unit tests. The orchestrator now calls into the networking leaf for inference context construction; the leaf owns its own route bundle resolution end-to-end. The new module is named inference_routes to avoid colliding with the existing l7::inference module, which handles request-time HTTP parsing and pattern matching rather than route bundle setup. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 711 +---------------- .../src/inference_routes.rs | 737 ++++++++++++++++++ .../src/lib.rs | 1 + 3 files changed, 745 insertions(+), 704 deletions(-) create mode 100644 crates/openshell-supervisor-networking/src/inference_routes.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index c502f8d54..20747a599 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -9,11 +9,9 @@ use miette::{IntoDiagnostic, Result}; use std::future::Future; use std::net::SocketAddr; use std::sync::Arc; -#[cfg(any(target_os = "linux", test))] -use std::sync::Mutex; use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, info, warn}; use openshell_ocsf::{ ActionId, ActivityId, AppLifecycleBuilder, ConfigStateChangeBuilder, DetectionFindingBuilder, @@ -71,63 +69,6 @@ pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; -/// Default interval (seconds) for re-fetching the inference route bundle from -/// the gateway in cluster mode. Override at runtime with the -/// `OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` environment variable. -/// File-based routes (`--inference-routes`) are loaded once at startup and never -/// refreshed. 
-const DEFAULT_ROUTE_REFRESH_INTERVAL_SECS: u64 = 5; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum InferenceRouteSource { - File, - Cluster, - None, -} - -fn infer_route_source( - sandbox_id: Option<&str>, - openshell_endpoint: Option<&str>, - inference_routes: Option<&str>, -) -> InferenceRouteSource { - if inference_routes.is_some() { - InferenceRouteSource::File - } else if sandbox_id.is_some() && openshell_endpoint.is_some() { - InferenceRouteSource::Cluster - } else { - InferenceRouteSource::None - } -} - -fn disable_inference_on_empty_routes(source: InferenceRouteSource) -> bool { - !matches!(source, InferenceRouteSource::Cluster) -} - -fn route_refresh_interval_secs() -> u64 { - let Ok(value) = std::env::var("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS") else { - return DEFAULT_ROUTE_REFRESH_INTERVAL_SECS; - }; - match value.parse::() { - Ok(interval) if interval > 0 => interval, - Ok(_) => { - warn!( - default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, - "Ignoring zero route refresh interval" - ); - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - } - Err(error) => { - warn!( - interval = %value, - error = %error, - default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, - "Ignoring invalid route refresh interval" - ); - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - } - } -} - #[cfg(target_os = "linux")] use openshell_supervisor_process::managed_children; @@ -307,7 +248,12 @@ async fn run_networking( // Build inference context for local routing of intercepted inference calls. let inference_ctx = - build_inference_context(sandbox_id, openshell_endpoint, inference_routes).await?; + openshell_supervisor_networking::inference_routes::build_inference_context( + sandbox_id, + openshell_endpoint, + inference_routes, + ) + .await?; // Create denial aggregator channel if in gRPC mode (sandbox_id present). // Clone the sender for the bypass monitor before passing to the proxy. 
@@ -781,319 +727,6 @@ pub async fn run_sandbox( Ok(exit_code) } -/// Build an inference context for local routing, if route sources are available. -/// -/// Route sources (in priority order): -/// 1. Inference routes file (standalone mode) — always takes precedence -/// 2. Cluster bundle (fetched from gateway via gRPC) -/// -/// If both a routes file and cluster credentials are provided, the routes file -/// wins and the cluster bundle is not fetched. -/// -/// Returns `None` if neither source is configured (inference routing disabled). -// `routes`/`router` are intentionally distinct nouns (the route list vs the -// router that consumes them); both names are clearer than alternatives. -#[allow(clippy::similar_names)] -async fn build_inference_context( - sandbox_id: Option<&str>, - openshell_endpoint: Option<&str>, - inference_routes: Option<&str>, -) -> Result>> { - use openshell_router::Router; - use openshell_router::config::RouterConfig; - - let source = infer_route_source(sandbox_id, openshell_endpoint, inference_routes); - - // Captured during the initial cluster bundle fetch so the background refresh - // loop can skip no-op updates from the very first tick. 
- let mut initial_revision: Option = None; - - let routes = match source { - InferenceRouteSource::File => { - let Some(path) = inference_routes else { - return Ok(None); - }; - - // Standalone mode: load routes from file (fail-fast on errors) - if sandbox_id.is_some() { - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "loaded") - .unmapped("inference_routes", serde_json::json!(path)) - .message(format!( - "Inference routes file takes precedence over cluster bundle [path:{path}]" - )) - .build()); - } - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Other, "loading") - .unmapped("inference_routes", serde_json::json!(path)) - .message(format!("Loading inference routes from file [path:{path}]")) - .build() - ); - let config = RouterConfig::load_from_file(std::path::Path::new(path)) - .map_err(|e| miette::miette!("failed to load inference routes {path}: {e}"))?; - config - .resolve_routes() - .map_err(|e| miette::miette!("failed to resolve routes from {path}: {e}"))? 
- } - InferenceRouteSource::Cluster => { - let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) else { - return Ok(None); - }; - - // Cluster mode: fetch bundle from gateway - info!(endpoint = %endpoint, "Fetching inference route bundle from gateway"); - match openshell_core::grpc_client::fetch_inference_bundle(endpoint).await { - Ok(bundle) => { - initial_revision = Some(bundle.revision.clone()); - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "loaded") - .unmapped("route_count", serde_json::json!(bundle.routes.len())) - .unmapped("revision", serde_json::json!(&bundle.revision)) - .message(format!( - "Loaded inference route bundle [route_count:{} revision:{}]", - bundle.routes.len(), - bundle.revision - )) - .build() - ); - bundle_to_resolved_routes(&bundle) - } - Err(e) => { - // Distinguish expected "not configured" states from server errors. - // gRPC PermissionDenied/NotFound means inference bundle is unavailable - // for this sandbox — skip gracefully. Other errors are unexpected. 
- let msg = e.to_string(); - if msg.contains("permission denied") || msg.contains("not found") { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Disabled, "disabled") - .unmapped("error", serde_json::json!(e.to_string())) - .message(format!( - "Inference bundle unavailable, routing disabled [error:{e}]" - )) - .build() - ); - return Ok(None); - } - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .unmapped("error", serde_json::json!(e.to_string())) - .message(format!( - "Failed to fetch inference bundle, inference routing disabled [error:{e}]" - )) - .build()); - return Ok(None); - } - } - } - InferenceRouteSource::None => { - // No route source — inference routing is not configured - return Ok(None); - } - }; - - if routes.is_empty() && disable_inference_on_empty_routes(source) { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Disabled, "disabled") - .message("No usable inference routes, inference routing disabled") - .build() - ); - return Ok(None); - } - - if routes.is_empty() { - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Other, "waiting") - .message("Inference route bundle is empty; keeping routing enabled and waiting for refresh") - .build()); - } - - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "enabled") - .unmapped("route_count", serde_json::json!(routes.len())) - .message(format!( - "Inference routing enabled with local execution [route_count:{}]", - routes.len() - )) - .build() - ); - - // Partition routes by name into user-facing and system caches. 
- let (user_routes, system_routes) = partition_routes(routes); - - let router = - Router::new().map_err(|e| miette::miette!("failed to initialize inference router: {e}"))?; - let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); - - let ctx = Arc::new( - openshell_supervisor_networking::proxy::InferenceContext::new( - patterns, - router, - user_routes, - system_routes, - ), - ); - - // Spawn background route cache refresh for cluster mode at startup so - // request handling never depends on control-plane latency. - if matches!(source, InferenceRouteSource::Cluster) - && let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) - { - spawn_route_refresh( - ctx.route_cache(), - ctx.system_route_cache(), - endpoint.to_string(), - route_refresh_interval_secs(), - initial_revision, - ); - } - - Ok(Some(ctx)) -} - -/// Route name for the sandbox system inference route. -const SANDBOX_SYSTEM_ROUTE_NAME: &str = "sandbox-system"; - -/// Split resolved routes into user-facing and system caches by route name. -/// -/// Routes named `"sandbox-system"` go to the system cache; everything else -/// (including `"inference.local"` and empty names) goes to the user cache. -fn partition_routes( - routes: Vec, -) -> ( - Vec, - Vec, -) { - let mut user = Vec::new(); - let mut system = Vec::new(); - for r in routes { - if r.name == SANDBOX_SYSTEM_ROUTE_NAME { - system.push(r); - } else { - user.push(r); - } - } - (user, system) -} - -/// Convert a proto bundle response into resolved routes for the router. 
-pub(crate) fn bundle_to_resolved_routes( - bundle: &openshell_core::proto::GetInferenceBundleResponse, -) -> Vec { - bundle - .routes - .iter() - .map(|r| { - let (auth, default_headers, passthrough_headers) = - openshell_core::inference::route_headers_for_provider_type(&r.provider_type); - let timeout = if r.timeout_secs == 0 { - openshell_router::config::DEFAULT_ROUTE_TIMEOUT - } else { - Duration::from_secs(r.timeout_secs) - }; - openshell_router::config::ResolvedRoute { - name: r.name.clone(), - endpoint: r.base_url.clone(), - model: r.model_id.clone(), - api_key: r.api_key.clone(), - protocols: r.protocols.clone(), - auth, - default_headers, - passthrough_headers, - timeout, - } - }) - .collect() -} - -/// Spawn a background task that periodically refreshes both route caches from the gateway. -/// -/// The loop uses the bundle `revision` hash to avoid unnecessary cache writes -/// when routes haven't changed. `initial_revision` is the revision captured -/// during the startup fetch in [`build_inference_context`] so the first refresh -/// cycle can already skip a no-op update. 
-pub(crate) fn spawn_route_refresh( - user_cache: Arc>>, - system_cache: Arc>>, - endpoint: String, - interval_secs: u64, - initial_revision: Option, -) { - tokio::spawn(async move { - use tokio::time::{MissedTickBehavior, interval}; - - let mut current_revision = initial_revision; - - let mut tick = interval(Duration::from_secs(interval_secs)); - tick.set_missed_tick_behavior(MissedTickBehavior::Skip); - - loop { - tick.tick().await; - - match openshell_core::grpc_client::fetch_inference_bundle(&endpoint).await { - Ok(bundle) => { - if current_revision.as_deref() == Some(&bundle.revision) { - trace!(revision = %bundle.revision, "Inference bundle unchanged"); - continue; - } - - let routes = bundle_to_resolved_routes(&bundle); - let (user_routes, system_routes) = partition_routes(routes); - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "updated") - .unmapped("user_route_count", serde_json::json!(user_routes.len())) - .unmapped("system_route_count", serde_json::json!(system_routes.len())) - .unmapped("revision", serde_json::json!(&bundle.revision)) - .message(format!( - "Inference routes updated [user_route_count:{} system_route_count:{} revision:{}]", - user_routes.len(), - system_routes.len(), - bundle.revision - )) - .build()); - current_revision = Some(bundle.revision); - *user_cache.write().await = user_routes; - *system_cache.write().await = system_routes; - } - Err(e) => { - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Other, "stale") - .unmapped("error", serde_json::json!(e.to_string())) - .message(format!( - "Failed to refresh inference route cache, keeping stale routes [error:{e}]" - )) - .build()); - } - } - } - }); -} - // ============================================================================ // Baseline filesystem path enrichment // 
============================================================================ @@ -2467,263 +2100,6 @@ mod tests { use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[cfg(unix)] use std::os::unix::fs::{MetadataExt, symlink}; - use std::sync::LazyLock; - use temp_env::with_vars; - - static ENV_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); - - #[test] - fn bundle_to_resolved_routes_converts_all_fields() { - let bundle = openshell_core::proto::GetInferenceBundleResponse { - routes: vec![ - openshell_core::proto::ResolvedRoute { - name: "frontier".to_string(), - base_url: "https://api.example.com/v1".to_string(), - api_key: "sk-test-key".to_string(), - model_id: "gpt-4".to_string(), - protocols: vec![ - "openai_chat_completions".to_string(), - "openai_responses".to_string(), - ], - provider_type: "openai".to_string(), - timeout_secs: 0, - }, - openshell_core::proto::ResolvedRoute { - name: "local".to_string(), - base_url: "http://vllm:8000/v1".to_string(), - api_key: "local-key".to_string(), - model_id: "llama-3".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - provider_type: String::new(), - timeout_secs: 120, - }, - ], - revision: "abc123".to_string(), - generated_at_ms: 1000, - }; - - let routes = bundle_to_resolved_routes(&bundle); - - assert_eq!(routes.len(), 2); - assert_eq!(routes[0].endpoint, "https://api.example.com/v1"); - assert_eq!(routes[0].model, "gpt-4"); - assert_eq!(routes[0].api_key, "sk-test-key"); - assert_eq!( - routes[0].auth, - openshell_core::inference::AuthHeader::Bearer - ); - assert_eq!( - routes[0].protocols, - vec!["openai_chat_completions", "openai_responses"] - ); - assert_eq!( - routes[0].timeout, - openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - "timeout_secs=0 should map to default" - ); - assert_eq!(routes[1].endpoint, "http://vllm:8000/v1"); - assert_eq!( - routes[1].auth, - openshell_core::inference::AuthHeader::Bearer - ); - assert_eq!( - routes[1].timeout, - 
Duration::from_secs(120), - "timeout_secs=120 should map to 120s" - ); - } - - #[test] - fn bundle_to_resolved_routes_handles_empty_bundle() { - let bundle = openshell_core::proto::GetInferenceBundleResponse { - routes: vec![], - revision: "empty".to_string(), - generated_at_ms: 0, - }; - - let routes = bundle_to_resolved_routes(&bundle); - assert!(routes.is_empty()); - } - - #[test] - fn bundle_to_resolved_routes_preserves_name_field() { - let bundle = openshell_core::proto::GetInferenceBundleResponse { - routes: vec![openshell_core::proto::ResolvedRoute { - name: "sandbox-system".to_string(), - base_url: "https://api.example.com/v1".to_string(), - api_key: "key".to_string(), - model_id: "model".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - provider_type: "openai".to_string(), - timeout_secs: 0, - }], - revision: "rev".to_string(), - generated_at_ms: 0, - }; - - let routes = bundle_to_resolved_routes(&bundle); - assert_eq!(routes[0].name, "sandbox-system"); - } - - #[test] - fn routes_segregated_by_name() { - let routes = vec![ - openshell_router::config::ResolvedRoute { - name: "inference.local".to_string(), - endpoint: "https://api.openai.com/v1".to_string(), - model: "gpt-4o".to_string(), - api_key: "key1".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - auth: openshell_core::inference::AuthHeader::Bearer, - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - }, - openshell_router::config::ResolvedRoute { - name: "sandbox-system".to_string(), - endpoint: "https://api.anthropic.com/v1".to_string(), - model: "claude-sonnet-4-20250514".to_string(), - api_key: "key2".to_string(), - protocols: vec!["anthropic_messages".to_string()], - auth: openshell_core::inference::AuthHeader::Custom("x-api-key"), - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - }, - ]; - - let (user, system) = 
partition_routes(routes); - assert_eq!(user.len(), 1); - assert_eq!(user[0].name, "inference.local"); - assert_eq!(system.len(), 1); - assert_eq!(system[0].name, "sandbox-system"); - } - - // -- build_inference_context tests -- - - #[tokio::test] - async fn build_inference_context_route_file_loads_routes() { - use std::io::Write; - - let yaml = r#" -routes: - - name: inference.local - endpoint: http://localhost:8000/v1 - model: llama-3 - protocols: [openai_chat_completions] - api_key: test-key -"#; - let mut f = tempfile::NamedTempFile::new().unwrap(); - f.write_all(yaml.as_bytes()).unwrap(); - let path = f.path().to_str().unwrap(); - - let ctx = build_inference_context(None, None, Some(path)) - .await - .expect("should load routes from file"); - - let ctx = ctx.expect("context should be Some"); - let cache = ctx.route_cache(); - let routes = cache.read().await; - assert_eq!(routes.len(), 1); - assert_eq!(routes[0].endpoint, "http://localhost:8000/v1"); - } - - #[tokio::test] - async fn build_inference_context_empty_route_file_returns_none() { - use std::io::Write; - - // Route file with empty routes list → inference routing disabled (not an error) - let yaml = "routes: []\n"; - let mut f = tempfile::NamedTempFile::new().unwrap(); - f.write_all(yaml.as_bytes()).unwrap(); - let path = f.path().to_str().unwrap(); - - let ctx = build_inference_context(None, None, Some(path)) - .await - .expect("empty routes file should not error"); - assert!( - ctx.is_none(), - "empty routes should disable inference routing" - ); - } - - #[tokio::test] - async fn build_inference_context_no_sources_returns_none() { - let ctx = build_inference_context(None, None, None) - .await - .expect("should succeed with None"); - - assert!(ctx.is_none(), "no sources should return None"); - } - - #[tokio::test] - async fn build_inference_context_route_file_overrides_cluster() { - use std::io::Write; - - let yaml = r#" -routes: - - name: inference.local - endpoint: http://localhost:9999/v1 - model: 
file-model - protocols: [openai_chat_completions] - api_key: file-key -"#; - let mut f = tempfile::NamedTempFile::new().unwrap(); - f.write_all(yaml.as_bytes()).unwrap(); - let path = f.path().to_str().unwrap(); - - // Even with sandbox_id and endpoint, route_file takes precedence - let ctx = build_inference_context(Some("sb-1"), Some("http://localhost:50051"), Some(path)) - .await - .expect("should load from file"); - - let ctx = ctx.expect("context should be Some"); - let cache = ctx.route_cache(); - let routes = cache.read().await; - assert_eq!(routes[0].endpoint, "http://localhost:9999/v1"); - } - - #[test] - fn infer_route_source_prefers_file_mode() { - assert_eq!( - infer_route_source( - Some("sb-1"), - Some("http://localhost:50051"), - Some("routes.yaml") - ), - InferenceRouteSource::File - ); - } - - #[test] - fn infer_route_source_cluster_requires_id_and_endpoint() { - assert_eq!( - infer_route_source(Some("sb-1"), Some("http://localhost:50051"), None), - InferenceRouteSource::Cluster - ); - assert_eq!( - infer_route_source(Some("sb-1"), None, None), - InferenceRouteSource::None - ); - assert_eq!( - infer_route_source(None, Some("http://localhost:50051"), None), - InferenceRouteSource::None - ); - } - - #[test] - fn disable_inference_on_empty_routes_depends_on_source() { - assert!(disable_inference_on_empty_routes( - InferenceRouteSource::File - )); - assert!(!disable_inference_on_empty_routes( - InferenceRouteSource::Cluster - )); - assert!(disable_inference_on_empty_routes( - InferenceRouteSource::None - )); - } - // ---- Policy disk discovery tests ---- #[test] @@ -2818,79 +2194,6 @@ filesystem_policy: assert!(matches!(local_policy.network.mode, NetworkMode::Proxy)); } - // ---- Route refresh interval + revision tests ---- - - #[test] - fn default_route_refresh_interval_is_five_seconds() { - assert_eq!(DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, 5); - } - - #[test] - fn route_refresh_interval_uses_env_override() { - let _guard = ENV_LOCK.lock().unwrap(); - 
with_vars( - [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("9"))], - || { - assert_eq!(route_refresh_interval_secs(), 9); - }, - ); - } - - #[test] - fn route_refresh_interval_rejects_zero() { - let _guard = ENV_LOCK.lock().unwrap(); - with_vars( - [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("0"))], - || { - assert_eq!( - route_refresh_interval_secs(), - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - ); - }, - ); - } - - #[test] - fn route_refresh_interval_rejects_invalid_values() { - let _guard = ENV_LOCK.lock().unwrap(); - with_vars( - [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("abc"))], - || { - assert_eq!( - route_refresh_interval_secs(), - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - ); - }, - ); - } - - #[tokio::test] - async fn route_cache_preserves_content_when_not_written() { - use std::sync::Arc; - use tokio::sync::RwLock; - - let routes = vec![openshell_router::config::ResolvedRoute { - name: "inference.local".to_string(), - endpoint: "http://original:8000/v1".to_string(), - model: "original-model".to_string(), - api_key: "key".to_string(), - auth: openshell_core::inference::AuthHeader::Bearer, - protocols: vec!["openai_chat_completions".to_string()], - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - }]; - - let cache = Arc::new(RwLock::new(routes)); - - // Verify the cache preserves its content — the revision-based skip - // logic in spawn_route_refresh ensures the cache is only written - // when the revision actually changes. 
- let read = cache.read().await; - assert_eq!(read.len(), 1); - assert_eq!(read[0].model, "original-model"); - } - #[cfg(unix)] fn sandbox_policy_with_read_write( path: std::path::PathBuf, diff --git a/crates/openshell-supervisor-networking/src/inference_routes.rs b/crates/openshell-supervisor-networking/src/inference_routes.rs new file mode 100644 index 000000000..c9f9983ee --- /dev/null +++ b/crates/openshell-supervisor-networking/src/inference_routes.rs @@ -0,0 +1,737 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Inference route bundle resolution and refresh. +//! +//! Resolves inference routes from one of two sources at sandbox startup: +//! a local YAML file (`--inference-routes`) or a cluster bundle fetched via +//! gRPC. Builds the [`InferenceContext`] consumed by the proxy's L7 layer +//! and spawns a background refresh loop in cluster mode so route changes +//! propagate without restarting the sandbox. +//! +//! Distinct from [`crate::l7::inference`], which parses HTTP requests and +//! matches them against API patterns at request time. +//! +//! [`InferenceContext`]: crate::proxy::InferenceContext + +use std::sync::Arc; +use std::time::Duration; + +use miette::Result; +use tracing::{info, trace, warn}; + +use openshell_ocsf::{ + ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ctx::ctx as ocsf_ctx, ocsf_emit, +}; + +/// Default interval (seconds) for re-fetching the inference route bundle from +/// the gateway in cluster mode. +/// +/// Override at runtime with the `OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` +/// environment variable. File-based routes (`--inference-routes`) are loaded +/// once at startup and never refreshed. +pub const DEFAULT_ROUTE_REFRESH_INTERVAL_SECS: u64 = 5; + +/// Route name for the sandbox system inference route. 
+const SANDBOX_SYSTEM_ROUTE_NAME: &str = "sandbox-system"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum InferenceRouteSource { + File, + Cluster, + None, +} + +pub fn infer_route_source( + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> InferenceRouteSource { + if inference_routes.is_some() { + InferenceRouteSource::File + } else if sandbox_id.is_some() && openshell_endpoint.is_some() { + InferenceRouteSource::Cluster + } else { + InferenceRouteSource::None + } +} + +pub fn disable_inference_on_empty_routes(source: InferenceRouteSource) -> bool { + !matches!(source, InferenceRouteSource::Cluster) +} + +pub fn route_refresh_interval_secs() -> u64 { + let Ok(value) = std::env::var("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS") else { + return DEFAULT_ROUTE_REFRESH_INTERVAL_SECS; + }; + match value.parse::() { + Ok(interval) if interval > 0 => interval, + Ok(_) => { + warn!( + default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, + "Ignoring zero route refresh interval" + ); + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + } + Err(error) => { + warn!( + interval = %value, + error = %error, + default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, + "Ignoring invalid route refresh interval" + ); + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + } + } +} + +/// Build an [`InferenceContext`](crate::proxy::InferenceContext) by resolving +/// inference routes from either a local YAML file or the gateway bundle. +/// +/// If both a routes file and cluster credentials are provided, the routes file +/// wins and the cluster bundle is not fetched. +/// +/// Returns `None` if neither source is configured (inference routing disabled). +/// +/// # Errors +/// +/// Returns an error if loading the routes file fails or the file's routes +/// cannot be resolved. gRPC errors are swallowed (logged) and produce +/// `Ok(None)` so a missing cluster bundle disables inference routing rather +/// than aborting sandbox startup. 
+// `routes`/`router` are intentionally distinct nouns (the route list vs the +// router that consumes them); both names are clearer than alternatives. +#[allow(clippy::similar_names)] +pub async fn build_inference_context( + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> Result>> { + use openshell_router::Router; + use openshell_router::config::RouterConfig; + + let source = infer_route_source(sandbox_id, openshell_endpoint, inference_routes); + + // Captured during the initial cluster bundle fetch so the background refresh + // loop can skip no-op updates from the very first tick. + let mut initial_revision: Option = None; + + let routes = match source { + InferenceRouteSource::File => { + let Some(path) = inference_routes else { + return Ok(None); + }; + + // Standalone mode: load routes from file (fail-fast on errors) + if sandbox_id.is_some() { + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "loaded") + .unmapped("inference_routes", serde_json::json!(path)) + .message(format!( + "Inference routes file takes precedence over cluster bundle [path:{path}]" + )) + .build()); + } + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Other, "loading") + .unmapped("inference_routes", serde_json::json!(path)) + .message(format!("Loading inference routes from file [path:{path}]")) + .build() + ); + let config = RouterConfig::load_from_file(std::path::Path::new(path)) + .map_err(|e| miette::miette!("failed to load inference routes {path}: {e}"))?; + config + .resolve_routes() + .map_err(|e| miette::miette!("failed to resolve routes from {path}: {e}"))? 
+ } + InferenceRouteSource::Cluster => { + let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) else { + return Ok(None); + }; + + // Cluster mode: fetch bundle from gateway + info!(endpoint = %endpoint, "Fetching inference route bundle from gateway"); + match openshell_core::grpc_client::fetch_inference_bundle(endpoint).await { + Ok(bundle) => { + initial_revision = Some(bundle.revision.clone()); + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "loaded") + .unmapped("route_count", serde_json::json!(bundle.routes.len())) + .unmapped("revision", serde_json::json!(&bundle.revision)) + .message(format!( + "Loaded inference route bundle [route_count:{} revision:{}]", + bundle.routes.len(), + bundle.revision + )) + .build() + ); + bundle_to_resolved_routes(&bundle) + } + Err(e) => { + // Distinguish expected "not configured" states from server errors. + // gRPC PermissionDenied/NotFound means inference bundle is unavailable + // for this sandbox — skip gracefully. Other errors are unexpected. 
+ let msg = e.to_string(); + if msg.contains("permission denied") || msg.contains("not found") { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Disabled, "disabled") + .unmapped("error", serde_json::json!(e.to_string())) + .message(format!( + "Inference bundle unavailable, routing disabled [error:{e}]" + )) + .build() + ); + return Ok(None); + } + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .unmapped("error", serde_json::json!(e.to_string())) + .message(format!( + "Failed to fetch inference bundle, inference routing disabled [error:{e}]" + )) + .build()); + return Ok(None); + } + } + } + InferenceRouteSource::None => { + // No route source — inference routing is not configured + return Ok(None); + } + }; + + if routes.is_empty() && disable_inference_on_empty_routes(source) { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Disabled, "disabled") + .message("No usable inference routes, inference routing disabled") + .build() + ); + return Ok(None); + } + + if routes.is_empty() { + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Other, "waiting") + .message("Inference route bundle is empty; keeping routing enabled and waiting for refresh") + .build()); + } + + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "enabled") + .unmapped("route_count", serde_json::json!(routes.len())) + .message(format!( + "Inference routing enabled with local execution [route_count:{}]", + routes.len() + )) + .build() + ); + + // Partition routes by name into user-facing and system caches. 
+ let (user_routes, system_routes) = partition_routes(routes); + + let router = + Router::new().map_err(|e| miette::miette!("failed to initialize inference router: {e}"))?; + let patterns = crate::l7::inference::default_patterns(); + + let ctx = Arc::new(crate::proxy::InferenceContext::new( + patterns, + router, + user_routes, + system_routes, + )); + + // Spawn background route cache refresh for cluster mode at startup so + // request handling never depends on control-plane latency. + if matches!(source, InferenceRouteSource::Cluster) + && let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) + { + spawn_route_refresh( + ctx.route_cache(), + ctx.system_route_cache(), + endpoint.to_string(), + route_refresh_interval_secs(), + initial_revision, + ); + } + + Ok(Some(ctx)) +} + +/// Split resolved routes into user-facing and system caches by route name. +/// +/// Routes named `"sandbox-system"` go to the system cache; everything else +/// (including `"inference.local"` and empty names) goes to the user cache. +pub fn partition_routes( + routes: Vec, +) -> ( + Vec, + Vec, +) { + let mut user = Vec::new(); + let mut system = Vec::new(); + for r in routes { + if r.name == SANDBOX_SYSTEM_ROUTE_NAME { + system.push(r); + } else { + user.push(r); + } + } + (user, system) +} + +/// Convert a proto bundle response into resolved routes for the router. 
+pub fn bundle_to_resolved_routes( + bundle: &openshell_core::proto::GetInferenceBundleResponse, +) -> Vec { + bundle + .routes + .iter() + .map(|r| { + let (auth, default_headers, passthrough_headers) = + openshell_core::inference::route_headers_for_provider_type(&r.provider_type); + let timeout = if r.timeout_secs == 0 { + openshell_router::config::DEFAULT_ROUTE_TIMEOUT + } else { + Duration::from_secs(r.timeout_secs) + }; + openshell_router::config::ResolvedRoute { + name: r.name.clone(), + endpoint: r.base_url.clone(), + model: r.model_id.clone(), + api_key: r.api_key.clone(), + protocols: r.protocols.clone(), + auth, + default_headers, + passthrough_headers, + timeout, + } + }) + .collect() +} + +/// Spawn a background task that periodically refreshes both route caches from the gateway. +/// +/// The loop uses the bundle `revision` hash to avoid unnecessary cache writes +/// when routes haven't changed. `initial_revision` is the revision captured +/// during the startup fetch in [`build_inference_context`] so the first refresh +/// cycle can already skip a no-op update. 
+pub fn spawn_route_refresh( + user_cache: Arc>>, + system_cache: Arc>>, + endpoint: String, + interval_secs: u64, + initial_revision: Option, +) { + tokio::spawn(async move { + use tokio::time::{MissedTickBehavior, interval}; + + let mut current_revision = initial_revision; + + let mut tick = interval(Duration::from_secs(interval_secs)); + tick.set_missed_tick_behavior(MissedTickBehavior::Skip); + + loop { + tick.tick().await; + + match openshell_core::grpc_client::fetch_inference_bundle(&endpoint).await { + Ok(bundle) => { + if current_revision.as_deref() == Some(&bundle.revision) { + trace!(revision = %bundle.revision, "Inference bundle unchanged"); + continue; + } + + let routes = bundle_to_resolved_routes(&bundle); + let (user_routes, system_routes) = partition_routes(routes); + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "updated") + .unmapped("user_route_count", serde_json::json!(user_routes.len())) + .unmapped("system_route_count", serde_json::json!(system_routes.len())) + .unmapped("revision", serde_json::json!(&bundle.revision)) + .message(format!( + "Inference routes updated [user_route_count:{} system_route_count:{} revision:{}]", + user_routes.len(), + system_routes.len(), + bundle.revision + )) + .build()); + current_revision = Some(bundle.revision); + *user_cache.write().await = user_routes; + *system_cache.write().await = system_routes; + } + Err(e) => { + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Other, "stale") + .unmapped("error", serde_json::json!(e.to_string())) + .message(format!( + "Failed to refresh inference route cache, keeping stale routes [error:{e}]" + )) + .build()); + } + } + } + }); +} + +#[cfg(test)] +#[allow( + clippy::needless_raw_string_hashes, + clippy::similar_names, + reason = "Test code: test fixtures often use idiomatic forms not 
flagged in production." +)] +mod tests { + use super::*; + use std::sync::{LazyLock, Mutex}; + use temp_env::with_vars; + + static ENV_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); + + #[test] + fn bundle_to_resolved_routes_converts_all_fields() { + let bundle = openshell_core::proto::GetInferenceBundleResponse { + routes: vec![ + openshell_core::proto::ResolvedRoute { + name: "frontier".to_string(), + base_url: "https://api.example.com/v1".to_string(), + api_key: "sk-test-key".to_string(), + model_id: "gpt-4".to_string(), + protocols: vec![ + "openai_chat_completions".to_string(), + "openai_responses".to_string(), + ], + provider_type: "openai".to_string(), + timeout_secs: 0, + }, + openshell_core::proto::ResolvedRoute { + name: "local".to_string(), + base_url: "http://vllm:8000/v1".to_string(), + api_key: "local-key".to_string(), + model_id: "llama-3".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + provider_type: String::new(), + timeout_secs: 120, + }, + ], + revision: "abc123".to_string(), + generated_at_ms: 1000, + }; + + let routes = bundle_to_resolved_routes(&bundle); + + assert_eq!(routes.len(), 2); + assert_eq!(routes[0].endpoint, "https://api.example.com/v1"); + assert_eq!(routes[0].model, "gpt-4"); + assert_eq!(routes[0].api_key, "sk-test-key"); + assert_eq!( + routes[0].auth, + openshell_core::inference::AuthHeader::Bearer + ); + assert_eq!( + routes[0].protocols, + vec!["openai_chat_completions", "openai_responses"] + ); + assert_eq!( + routes[0].timeout, + openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + "timeout_secs=0 should map to default" + ); + assert_eq!(routes[1].endpoint, "http://vllm:8000/v1"); + assert_eq!( + routes[1].auth, + openshell_core::inference::AuthHeader::Bearer + ); + assert_eq!( + routes[1].timeout, + Duration::from_secs(120), + "timeout_secs=120 should map to 120s" + ); + } + + #[test] + fn bundle_to_resolved_routes_handles_empty_bundle() { + let bundle = 
openshell_core::proto::GetInferenceBundleResponse { + routes: vec![], + revision: "empty".to_string(), + generated_at_ms: 0, + }; + + let routes = bundle_to_resolved_routes(&bundle); + assert!(routes.is_empty()); + } + + #[test] + fn bundle_to_resolved_routes_preserves_name_field() { + let bundle = openshell_core::proto::GetInferenceBundleResponse { + routes: vec![openshell_core::proto::ResolvedRoute { + name: "sandbox-system".to_string(), + base_url: "https://api.example.com/v1".to_string(), + api_key: "key".to_string(), + model_id: "model".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + provider_type: "openai".to_string(), + timeout_secs: 0, + }], + revision: "rev".to_string(), + generated_at_ms: 0, + }; + + let routes = bundle_to_resolved_routes(&bundle); + assert_eq!(routes[0].name, "sandbox-system"); + } + + #[test] + fn routes_segregated_by_name() { + let routes = vec![ + openshell_router::config::ResolvedRoute { + name: "inference.local".to_string(), + endpoint: "https://api.openai.com/v1".to_string(), + model: "gpt-4o".to_string(), + api_key: "key1".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + auth: openshell_core::inference::AuthHeader::Bearer, + default_headers: vec![], + passthrough_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + }, + openshell_router::config::ResolvedRoute { + name: "sandbox-system".to_string(), + endpoint: "https://api.anthropic.com/v1".to_string(), + model: "claude-sonnet-4-20250514".to_string(), + api_key: "key2".to_string(), + protocols: vec!["anthropic_messages".to_string()], + auth: openshell_core::inference::AuthHeader::Custom("x-api-key"), + default_headers: vec![], + passthrough_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + }, + ]; + + let (user, system) = partition_routes(routes); + assert_eq!(user.len(), 1); + assert_eq!(user[0].name, "inference.local"); + assert_eq!(system.len(), 1); + 
assert_eq!(system[0].name, "sandbox-system"); + } + + // -- build_inference_context tests -- + + #[tokio::test] + async fn build_inference_context_route_file_loads_routes() { + use std::io::Write; + + let yaml = r#" +routes: + - name: inference.local + endpoint: http://localhost:8000/v1 + model: llama-3 + protocols: [openai_chat_completions] + api_key: test-key +"#; + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(yaml.as_bytes()).unwrap(); + let path = f.path().to_str().unwrap(); + + let ctx = build_inference_context(None, None, Some(path)) + .await + .expect("should load routes from file"); + + let ctx = ctx.expect("context should be Some"); + let cache = ctx.route_cache(); + let routes = cache.read().await; + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].endpoint, "http://localhost:8000/v1"); + } + + #[tokio::test] + async fn build_inference_context_empty_route_file_returns_none() { + use std::io::Write; + + // Route file with empty routes list → inference routing disabled (not an error) + let yaml = "routes: []\n"; + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(yaml.as_bytes()).unwrap(); + let path = f.path().to_str().unwrap(); + + let ctx = build_inference_context(None, None, Some(path)) + .await + .expect("empty routes file should not error"); + assert!( + ctx.is_none(), + "empty routes should disable inference routing" + ); + } + + #[tokio::test] + async fn build_inference_context_no_sources_returns_none() { + let ctx = build_inference_context(None, None, None) + .await + .expect("should succeed with None"); + + assert!(ctx.is_none(), "no sources should return None"); + } + + #[tokio::test] + async fn build_inference_context_route_file_overrides_cluster() { + use std::io::Write; + + let yaml = r#" +routes: + - name: inference.local + endpoint: http://localhost:9999/v1 + model: file-model + protocols: [openai_chat_completions] + api_key: file-key +"#; + let mut f = tempfile::NamedTempFile::new().unwrap(); + 
f.write_all(yaml.as_bytes()).unwrap(); + let path = f.path().to_str().unwrap(); + + // Even with sandbox_id and endpoint, route_file takes precedence + let ctx = build_inference_context(Some("sb-1"), Some("http://localhost:50051"), Some(path)) + .await + .expect("should load from file"); + + let ctx = ctx.expect("context should be Some"); + let cache = ctx.route_cache(); + let routes = cache.read().await; + assert_eq!(routes[0].endpoint, "http://localhost:9999/v1"); + } + + #[test] + fn infer_route_source_prefers_file_mode() { + assert_eq!( + infer_route_source( + Some("sb-1"), + Some("http://localhost:50051"), + Some("routes.yaml") + ), + InferenceRouteSource::File + ); + } + + #[test] + fn infer_route_source_cluster_requires_id_and_endpoint() { + assert_eq!( + infer_route_source(Some("sb-1"), Some("http://localhost:50051"), None), + InferenceRouteSource::Cluster + ); + assert_eq!( + infer_route_source(Some("sb-1"), None, None), + InferenceRouteSource::None + ); + assert_eq!( + infer_route_source(None, Some("http://localhost:50051"), None), + InferenceRouteSource::None + ); + } + + #[test] + fn disable_inference_on_empty_routes_depends_on_source() { + assert!(disable_inference_on_empty_routes( + InferenceRouteSource::File + )); + assert!(!disable_inference_on_empty_routes( + InferenceRouteSource::Cluster + )); + assert!(disable_inference_on_empty_routes( + InferenceRouteSource::None + )); + } + + // ---- Route refresh interval + revision tests ---- + + #[test] + fn default_route_refresh_interval_is_five_seconds() { + assert_eq!(DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, 5); + } + + #[test] + fn route_refresh_interval_uses_env_override() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars( + [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("9"))], + || { + assert_eq!(route_refresh_interval_secs(), 9); + }, + ); + } + + #[test] + fn route_refresh_interval_rejects_zero() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars( + [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", 
Some("0"))], + || { + assert_eq!( + route_refresh_interval_secs(), + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + ); + }, + ); + } + + #[test] + fn route_refresh_interval_rejects_invalid_values() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars( + [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("abc"))], + || { + assert_eq!( + route_refresh_interval_secs(), + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + ); + }, + ); + } + + #[tokio::test] + async fn route_cache_preserves_content_when_not_written() { + use std::sync::Arc; + use tokio::sync::RwLock; + + let routes = vec![openshell_router::config::ResolvedRoute { + name: "inference.local".to_string(), + endpoint: "http://original:8000/v1".to_string(), + model: "original-model".to_string(), + api_key: "key".to_string(), + auth: openshell_core::inference::AuthHeader::Bearer, + protocols: vec!["openai_chat_completions".to_string()], + default_headers: vec![], + passthrough_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + }]; + + let cache = Arc::new(RwLock::new(routes)); + + // Verify the cache preserves its content — the revision-based skip + // logic in spawn_route_refresh ensures the cache is only written + // when the revision actually changes. 
+ let read = cache.read().await; + assert_eq!(read.len(), 1); + assert_eq!(read[0].model, "original-model"); + } +} diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index fead9207c..e60d2d6dd 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -10,6 +10,7 @@ pub mod bypass_monitor; pub mod denial_aggregator; pub mod identity; +pub mod inference_routes; pub mod l7; pub mod mechanistic_mapper; pub mod opa; From 0aefa698a1262d5e1494405293a9ca7f7c7286be Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 09:49:14 +0300 Subject: [PATCH 33/49] refactor(supervisor-networking): move run_networking from openshell-sandbox Move the Networking handle struct, run_networking, and the Linux-only create_netns_for_proxy helper into a new openshell-supervisor-networking::run module. The orchestrator in openshell-sandbox now invokes openshell_supervisor_networking::run::{create_netns_for_proxy, run_networking} and reads the Networking fields directly; the leaf owns the entire networking-stack startup path (CA generation, proxy task, bypass monitor, inference context, denial channel) end-to-end. The Networking RAII handle fields (proxy, bypass_monitor) are now public without leading underscores so the public API satisfies clippy's pub_underscore_fields lint while still serving as drop guards held by the orchestrator's frame. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 287 +--------------- .../src/lib.rs | 1 + .../src/run.rs | 320 ++++++++++++++++++ 3 files changed, 323 insertions(+), 285 deletions(-) create mode 100644 crates/openshell-supervisor-networking/src/run.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 20747a599..dfcd3e91c 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,7 +7,6 @@ use miette::{IntoDiagnostic, Result}; use std::future::Future; -use std::net::SocketAddr; use std::sync::Arc; use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; @@ -57,14 +56,8 @@ pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_propos use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; -use openshell_supervisor_networking::identity::BinaryIdentityCache; -use openshell_supervisor_networking::l7::tls::{ - CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, - write_ca_files, -}; use openshell_supervisor_networking::mechanistic_mapper; use openshell_supervisor_networking::opa::OpaEngine; -use openshell_supervisor_networking::proxy::ProxyHandle; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; @@ -72,282 +65,6 @@ use openshell_supervisor_process::skills; #[cfg(target_os = "linux")] use openshell_supervisor_process::managed_children; -/// Handles and values produced by [`run_networking`] that the rest of -/// `run_sandbox` consumes. -/// -/// The two `_proxy` / `_bypass_monitor` fields are RAII handles whose drop -/// tears down the proxy and bypass-monitor tasks. 
They must remain alive for -/// the duration of the sandbox wait loop, which is achieved by holding the -/// returned `Networking` value in `run_sandbox`'s frame. -/// Create the workload's network namespace and install bypass detection -/// rules. Returns `None` when the policy is not in proxy mode. Linux-only. -/// -/// The namespace is shared infrastructure: the proxy binds to its host-side -/// veth IP and reads /dev/kmsg from inside it for bypass detection, while -/// the workload child and SSH sessions enter it via `setns()`. -#[cfg(target_os = "linux")] -fn create_netns_for_proxy(policy: &SandboxPolicy) -> Result> { - if !matches!(policy.network.mode, NetworkMode::Proxy) { - return Ok(None); - } - match NetworkNamespace::create() { - Ok(ns) => { - // Install bypass detection rules (nftables log + reject). - // This provides fast-fail UX and diagnostic logging for direct - // connection attempts that bypass the HTTP CONNECT proxy. - let proxy_port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - if let Err(e) = ns.install_bypass_rules(proxy_port) { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "degraded") - .message(format!( - "Failed to install bypass detection rules (non-fatal): {e}" - )) - .build() - ); - } - Ok(Some(ns)) - } - Err(e) => Err(miette::miette!( - "Network namespace creation failed and proxy mode requires isolation. \ - Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. 
\ - Error: {e}" - )), - } -} - -struct Networking { - #[allow(dead_code, reason = "RAII handle: drop tears down the proxy task")] - _proxy: Option, - #[cfg(target_os = "linux")] - #[allow(dead_code, reason = "RAII handle: drop joins the bypass monitor task")] - _bypass_monitor: Option>, - - ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, - ssh_proxy_url: Option, - ssh_netns_fd: Option, - denial_rx: Option>, -} - -/// Set up the networking stack: ephemeral CA + TLS state, proxy server, -/// bypass monitor, and the SSH-side proxy URL / netns FD. -/// -/// The network namespace is created by `run_sandbox` and borrowed in here — -/// it is shared infrastructure used by both the proxy (bind address, bypass -/// monitor) and the workload child (entered via `setns()` in `pre_exec`). -#[allow(clippy::too_many_arguments)] -async fn run_networking( - policy: &SandboxPolicy, - #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, - opa_engine: Option<&Arc>, - entrypoint_pid: Arc, - provider_credentials: &ProviderCredentialState, - policy_local_ctx: &Arc, - sandbox_id: Option<&str>, - openshell_endpoint: Option<&str>, - inference_routes: Option<&str>, -) -> Result { - // Identity cache for SHA256 TOFU when OPA is active. Only consumed by - // the proxy, so it's owned here. - let identity_cache = opa_engine.map(|_| Arc::new(BinaryIdentityCache::new())); - - // Generate ephemeral CA and TLS state for HTTPS L7 inspection. - // The CA cert is written to disk so sandbox processes can trust it. 
- let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { - match SandboxCa::generate() { - Ok(ca) => { - let tls_dir = std::path::Path::new("/etc/openshell-tls"); - let system_ca_bundle = read_system_ca_bundle(); - match write_ca_files(&ca, tls_dir, &system_ca_bundle) { - Ok(paths) => { - // /etc/openshell-tls is subsumed by the /etc baseline - // path injected by enrich_*_baseline_paths(), so no - // explicit Landlock entry is needed here. - - let upstream_config = build_upstream_client_config(&system_ca_bundle); - let cert_cache = CertCache::new(ca); - let state = Arc::new(ProxyTlsState::new(cert_cache, upstream_config)); - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "enabled") - .message("TLS termination enabled: ephemeral CA generated") - .build() - ); - (Some(state), Some(paths)) - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .message(format!( - "Failed to write CA files, TLS termination disabled: {e}" - )) - .build() - ); - (None, None) - } - } - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .message(format!( - "Failed to generate ephemeral CA, TLS termination disabled: {e}" - )) - .build() - ); - (None, None) - } - } - } else { - (None, None) - }; - - let (proxy_handle, denial_rx, bypass_denial_tx) = - if matches!(policy.network.mode, NetworkMode::Proxy) { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!( - "Network mode is set to proxy but no proxy configuration was provided" - ) - })?; - - let engine = opa_engine.cloned().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - 
})?; - - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!( - "Proxy mode requires an identity cache (OPA engine must be configured)" - ) - })?; - - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.map(|ns| { - let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); - - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; - - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = - openshell_supervisor_networking::inference_routes::build_inference_context( - sandbox_id, - openshell_endpoint, - inference_routes, - ) - .await?; - - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. - let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; - - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - ) - .await?; - (Some(proxy_handle), denial_rx, bypass_denial_tx) - } else { - (None, None, None) - }; - - // Spawn bypass detection monitor (Linux only, proxy mode only). - // Reads /dev/kmsg for nftables log entries and emits structured - // tracing events for direct connection attempts that bypass the proxy. 
- #[cfg(target_os = "linux")] - let bypass_monitor_handle = netns.and_then(|ns| { - openshell_supervisor_networking::bypass_monitor::spawn( - ns.name().to_string(), - entrypoint_pid.clone(), - bypass_denial_tx, - ) - }); - - // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). - #[cfg(not(target_os = "linux"))] - drop(bypass_denial_tx); - - // Compute the proxy URL and netns fd for SSH sessions. - // SSH shell processes need both to enforce network policy: - // - netns_fd: enter the network namespace via setns() so all traffic - // goes through the veth pair (hard enforcement, non-bypassable) - // - proxy_url: set proxy env vars so cooperative tools route through the - // CONNECT proxy; this also opts Node.js into honoring those vars - #[cfg(target_os = "linux")] - let ssh_netns_fd = netns.and_then(NetworkNamespace::ns_fd); - - #[cfg(not(target_os = "linux"))] - let ssh_netns_fd: Option = None; - - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { - #[cfg(target_os = "linux")] - { - netns.map(|ns| { - let port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) - } - #[cfg(not(target_os = "linux"))] - { - policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map(|addr| format!("http://{addr}")) - } - } else { - None - }; - - Ok(Networking { - _proxy: proxy_handle, - #[cfg(target_os = "linux")] - _bypass_monitor: bypass_monitor_handle, - ca_file_paths, - ssh_proxy_url, - ssh_netns_fd, - denial_rx, - }) -} - /// Run a command in the sandbox. /// /// # Errors @@ -545,9 +262,9 @@ pub async fn run_sandbox( // it via setns(). The RAII handle lives in this frame for the duration // of the sandbox. 
#[cfg(target_os = "linux")] - let netns = create_netns_for_proxy(&policy)?; + let netns = openshell_supervisor_networking::run::create_netns_for_proxy(&policy)?; - let mut networking = run_networking( + let mut networking = openshell_supervisor_networking::run::run_networking( &policy, #[cfg(target_os = "linux")] netns.as_ref(), diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-networking/src/lib.rs index e60d2d6dd..8d50e767e 100644 --- a/crates/openshell-supervisor-networking/src/lib.rs +++ b/crates/openshell-supervisor-networking/src/lib.rs @@ -16,3 +16,4 @@ pub mod mechanistic_mapper; pub mod opa; pub mod policy_local; pub mod proxy; +pub mod run; diff --git a/crates/openshell-supervisor-networking/src/run.rs b/crates/openshell-supervisor-networking/src/run.rs new file mode 100644 index 000000000..0f9db177d --- /dev/null +++ b/crates/openshell-supervisor-networking/src/run.rs @@ -0,0 +1,320 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Networking stack startup for the sandbox. +//! +//! Builds the network namespace (Linux), the CONNECT proxy with TLS L7 +//! interception, the bypass monitor, the inference context, and the +//! denial-event channel. Returns a [`Networking`] handle whose RAII fields +//! keep the proxy and bypass-monitor tasks alive for the lifetime of the +//! sandbox supervisor. 
+ +use std::net::SocketAddr; +use std::sync::Arc; +use std::sync::atomic::AtomicU32; + +use miette::Result; + +use openshell_core::DenialEvent; +#[cfg(target_os = "linux")] +use openshell_core::netns::NetworkNamespace; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; +use openshell_core::provider_credentials::ProviderCredentialState; +use openshell_ocsf::{ + ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ctx::ctx as ocsf_ctx, ocsf_emit, +}; + +use crate::identity::BinaryIdentityCache; +use crate::l7::tls::{ + CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, + write_ca_files, +}; +use crate::opa::OpaEngine; +use crate::policy_local::PolicyLocalContext; +use crate::proxy::ProxyHandle; + +/// Create the workload's network namespace and install bypass detection +/// rules. Returns `None` when the policy is not in proxy mode. Linux-only. +/// +/// The namespace is shared infrastructure: the proxy binds to its host-side +/// veth IP and reads /dev/kmsg from inside it for bypass detection, while +/// the workload child and SSH sessions enter it via `setns()`. +/// +/// # Errors +/// +/// Returns an error if proxy mode is requested but the namespace cannot be +/// created (e.g., missing `CAP_NET_ADMIN` / `CAP_SYS_ADMIN` or `iproute2`). +/// Failure to install nftables bypass-detection rules is non-fatal and is +/// reported via OCSF instead. +#[cfg(target_os = "linux")] +pub fn create_netns_for_proxy(policy: &SandboxPolicy) -> Result> { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return Ok(None); + } + match NetworkNamespace::create() { + Ok(ns) => { + // Install bypass detection rules (nftables log + reject). + // This provides fast-fail UX and diagnostic logging for direct + // connection attempts that bypass the HTTP CONNECT proxy. 
+ let proxy_port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + if let Err(e) = ns.install_bypass_rules(proxy_port) { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "degraded") + .message(format!( + "Failed to install bypass detection rules (non-fatal): {e}" + )) + .build() + ); + } + Ok(Some(ns)) + } + Err(e) => Err(miette::miette!( + "Network namespace creation failed and proxy mode requires isolation. \ + Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ + Error: {e}" + )), + } +} + +/// Handles and values produced by [`run_networking`] that the rest of +/// `run_sandbox` consumes. +/// +/// The `proxy` / `bypass_monitor` fields are RAII handles whose drop +/// tears down the proxy and bypass-monitor tasks. They must remain alive for +/// the duration of the sandbox wait loop, which is achieved by holding the +/// returned `Networking` value in `run_sandbox`'s frame. +pub struct Networking { + pub proxy: Option, + #[cfg(target_os = "linux")] + pub bypass_monitor: Option>, + + pub ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, + pub ssh_proxy_url: Option, + pub ssh_netns_fd: Option, + pub denial_rx: Option>, +} + +/// Set up the networking stack: ephemeral CA + TLS state, proxy server, +/// bypass monitor, and the SSH-side proxy URL / netns FD. +/// +/// The network namespace is created by `run_sandbox` and borrowed in here — +/// it is shared infrastructure used by both the proxy (bind address, bypass +/// monitor) and the workload child (entered via `setns()` in `pre_exec`). +/// +/// # Errors +/// +/// Returns an error if proxy mode is requested but the proxy configuration, +/// OPA engine, or identity cache is missing, if inference route resolution +/// fails, or if the proxy server fails to start. 
+#[allow(clippy::too_many_arguments)] +pub async fn run_networking( + policy: &SandboxPolicy, + #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, + opa_engine: Option<&Arc>, + entrypoint_pid: Arc, + provider_credentials: &ProviderCredentialState, + policy_local_ctx: &Arc, + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> Result { + // Identity cache for SHA256 TOFU when OPA is active. Only consumed by + // the proxy, so it's owned here. + let identity_cache = opa_engine.map(|_| Arc::new(BinaryIdentityCache::new())); + + // Generate ephemeral CA and TLS state for HTTPS L7 inspection. + // The CA cert is written to disk so sandbox processes can trust it. + let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { + match SandboxCa::generate() { + Ok(ca) => { + let tls_dir = std::path::Path::new("/etc/openshell-tls"); + let system_ca_bundle = read_system_ca_bundle(); + match write_ca_files(&ca, tls_dir, &system_ca_bundle) { + Ok(paths) => { + // /etc/openshell-tls is subsumed by the /etc baseline + // path injected by enrich_*_baseline_paths(), so no + // explicit Landlock entry is needed here. 
+ + let upstream_config = build_upstream_client_config(&system_ca_bundle); + let cert_cache = CertCache::new(ca); + let state = Arc::new(ProxyTlsState::new(cert_cache, upstream_config)); + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "enabled") + .message("TLS termination enabled: ephemeral CA generated") + .build() + ); + (Some(state), Some(paths)) + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .message(format!( + "Failed to write CA files, TLS termination disabled: {e}" + )) + .build() + ); + (None, None) + } + } + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .message(format!( + "Failed to generate ephemeral CA, TLS termination disabled: {e}" + )) + .build() + ); + (None, None) + } + } + } else { + (None, None) + }; + + let (proxy_handle, denial_rx, bypass_denial_tx) = + if matches!(policy.network.mode, NetworkMode::Proxy) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!( + "Network mode is set to proxy but no proxy configuration was provided" + ) + })?; + + let engine = opa_engine.cloned().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; + + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!( + "Proxy mode requires an identity cache (OPA engine must be configured)" + ) + })?; + + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. 
+ #[cfg(target_os = "linux")] + let bind_addr = netns.map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); + + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; + + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = crate::inference_routes::build_inference_context( + sandbox_id, + openshell_endpoint, + inference_routes, + ) + .await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). + // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) + } else { + (None, None, None) + }; + + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + ) + .await?; + (Some(proxy_handle), denial_rx, bypass_denial_tx) + } else { + (None, None, None) + }; + + // Spawn bypass detection monitor (Linux only, proxy mode only). + // Reads /dev/kmsg for nftables log entries and emits structured + // tracing events for direct connection attempts that bypass the proxy. + #[cfg(target_os = "linux")] + let bypass_monitor_handle = netns.and_then(|ns| { + crate::bypass_monitor::spawn( + ns.name().to_string(), + entrypoint_pid.clone(), + bypass_denial_tx, + ) + }); + + // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). + #[cfg(not(target_os = "linux"))] + drop(bypass_denial_tx); + + // Compute the proxy URL and netns fd for SSH sessions. 
+ // SSH shell processes need both to enforce network policy: + // - netns_fd: enter the network namespace via setns() so all traffic + // goes through the veth pair (hard enforcement, non-bypassable) + // - proxy_url: set proxy env vars so cooperative tools route through the + // CONNECT proxy; this also opts Node.js into honoring those vars + #[cfg(target_os = "linux")] + let ssh_netns_fd = netns.and_then(NetworkNamespace::ns_fd); + + #[cfg(not(target_os = "linux"))] + let ssh_netns_fd: Option = None; + + let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { + #[cfg(target_os = "linux")] + { + netns.map(|ns| { + let port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + format!("http://{}:{port}", ns.host_ip()) + }) + } + #[cfg(not(target_os = "linux"))] + { + policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map(|addr| format!("http://{addr}")) + } + } else { + None + }; + + Ok(Networking { + proxy: proxy_handle, + #[cfg(target_os = "linux")] + bypass_monitor: bypass_monitor_handle, + ca_file_paths, + ssh_proxy_url, + ssh_netns_fd, + denial_rx, + }) +} From dd653742b233731267596977bc62db38bec4eda2 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 11:19:37 +0300 Subject: [PATCH 34/49] fix(workspace): align Cargo deps and call sites for split crates The recent module lifts left three Linux-only gaps that the macOS host workspace check skipped: - openshell-core's netns module needs libc, tempfile, and nix on Linux, but only openshell-ocsf and uuid were carried over. - openshell-supervisor-process's seccomp/landlock modules need landlock and seccompiler, which still lived on openshell-sandbox. - openshell-sandbox's runtime_pid_limit branch referenced an unqualified process:: that pointed at the old in-crate module.
Move landlock/seccompiler to supervisor-process, add the missing core deps, qualify the call sites, and drop sandbox deps that no longer have runtime users (landlock, seccompiler, target-gated tempfile/uuid, the unix libc/rustix block). Signed-off-by: Radoslav Hubenov --- Cargo.lock | 8 ++++---- crates/openshell-core/Cargo.toml | 3 +++ crates/openshell-sandbox/Cargo.toml | 10 ---------- crates/openshell-sandbox/src/lib.rs | 11 +++-------- crates/openshell-supervisor-process/Cargo.toml | 4 ++++ 5 files changed, 14 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0d416c0f2..4882f0dbf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3460,7 +3460,9 @@ dependencies = [ "base64 0.22.1", "hex", "ipnet", + "libc", "miette", + "nix", "openshell-ocsf", "prost", "prost-types", @@ -3655,8 +3657,6 @@ dependencies = [ "hex", "hmac", "ipnet", - "landlock", - "libc", "miette", "nix", "openshell-core", @@ -3669,10 +3669,8 @@ dependencies = [ "rcgen", "regorus", "russh", - "rustix 1.1.4", "rustls", "rustls-pemfile", - "seccompiler", "serde", "serde_json", "serde_yml", @@ -3817,6 +3815,7 @@ dependencies = [ "anyhow", "base64 0.22.1", "hex", + "landlock", "libc", "miette", "nix", @@ -3825,6 +3824,7 @@ dependencies = [ "rand_core 0.6.4", "russh", "rustix 1.1.4", + "seccompiler", "serde_json", "sha2 0.10.9", "tempfile", diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index d0b13ff67..cb94fa8de 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -29,6 +29,9 @@ base64 = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] openshell-ocsf = { path = "../openshell-ocsf" } uuid = { workspace = true } +libc = "0.2" +tempfile = "3" +nix = { workspace = true } [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. 
diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 4d33ec47c..79c5c0b47 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -81,16 +81,6 @@ tracing-appender = { workspace = true } # Unix/Process nix = { workspace = true } -[target.'cfg(unix)'.dependencies] -libc = "0.2" -rustix = { workspace = true } - -[target.'cfg(target_os = "linux")'.dependencies] -landlock = "0.4" -seccompiler = "0.5" -tempfile = "3" -uuid = { version = "1", features = ["v4"] } - [dev-dependencies] openshell-core = { path = "../openshell-core", features = ["test-helpers"] } tempfile = "3" diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index dfcd3e91c..fcc189f4c 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -52,8 +52,6 @@ pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; /// guard (see `policy_local::tests::ProposalsFlagGuard`). pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_proposals_enabled}; -#[cfg(target_os = "linux")] -use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_networking::mechanistic_mapper; @@ -62,9 +60,6 @@ pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; -#[cfg(target_os = "linux")] -use openshell_supervisor_process::managed_children; - /// Run a command in the sandbox. 
/// /// # Errors @@ -201,11 +196,11 @@ pub async fn run_sandbox( #[cfg(target_os = "linux")] { let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { - process::RuntimePidLimitMode::Require + openshell_supervisor_process::process::RuntimePidLimitMode::Require } else { - process::RuntimePidLimitMode::Warn + openshell_supervisor_process::process::RuntimePidLimitMode::Warn }; - process::check_runtime_pid_limit(pid_limit_mode)?; + openshell_supervisor_process::process::check_runtime_pid_limit(pid_limit_mode)?; } // Initialize the agent-proposals feature flag. Default false until the diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index ec1520e42..9b8f50b2a 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -34,6 +34,10 @@ uuid = { workspace = true } libc = "0.2" rustix = { workspace = true } +[target.'cfg(target_os = "linux")'.dependencies] +landlock = "0.4" +seccompiler = "0.5" + [dev-dependencies] tempfile = "3" From b1fd66382a734ece2857fbfde495ce4549ba4f3b Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 12:46:29 +0300 Subject: [PATCH 35/49] refactor(supervisor-network): rename openshell-supervisor-networking to openshell-supervisor-network Signed-off-by: Radoslav Hubenov --- Cargo.lock | 4 ++-- crates/openshell-core/Cargo.toml | 2 +- crates/openshell-sandbox/Cargo.toml | 2 +- crates/openshell-sandbox/src/lib.rs | 24 +++++++++---------- .../Cargo.toml | 4 ++-- .../data/sandbox-policy.rego | 0 .../src/bypass_monitor.rs | 0 .../src/denial_aggregator.rs | 0 .../src/identity.rs | 0 .../src/inference_routes.rs | 0 .../src/l7/graphql.rs | 0 .../src/l7/inference.rs | 0 .../src/l7/mod.rs | 0 .../src/l7/path.rs | 0 .../src/l7/provider.rs | 0 .../src/l7/relay.rs | 0 .../src/l7/rest.rs | 0 .../src/l7/tls.rs | 0 .../src/l7/websocket.rs | 0 .../src/lib.rs | 0 .../src/mechanistic_mapper.rs | 0 
.../src/opa.rs | 0 .../src/policy_local.rs | 0 .../src/proxy.rs | 0 .../src/run.rs | 0 .../testdata/sandbox-policy.yaml | 0 .../tests/system_inference.rs | 8 +++---- .../tests/websocket_upgrade.rs | 6 ++--- 28 files changed, 23 insertions(+), 27 deletions(-) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/Cargo.toml (89%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/data/sandbox-policy.rego (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/bypass_monitor.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/denial_aggregator.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/identity.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/inference_routes.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/graphql.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/inference.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/mod.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/path.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/provider.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/relay.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/rest.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/tls.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/l7/websocket.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/lib.rs (100%) rename crates/{openshell-supervisor-networking => 
openshell-supervisor-network}/src/mechanistic_mapper.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/opa.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/policy_local.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/proxy.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/src/run.rs (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/testdata/sandbox-policy.yaml (100%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/tests/system_inference.rs (94%) rename crates/{openshell-supervisor-networking => openshell-supervisor-network}/tests/websocket_upgrade.rs (98%) diff --git a/Cargo.lock b/Cargo.lock index 4882f0dbf..bcec1f8b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3663,7 +3663,7 @@ dependencies = [ "openshell-ocsf", "openshell-policy", "openshell-router", - "openshell-supervisor-networking", + "openshell-supervisor-network", "openshell-supervisor-process", "rand_core 0.6.4", "rcgen", @@ -3771,7 +3771,7 @@ dependencies = [ ] [[package]] -name = "openshell-supervisor-networking" +name = "openshell-supervisor-network" version = "0.0.0" dependencies = [ "apollo-parser", diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index cb94fa8de..984ffcf06 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -40,7 +40,7 @@ nix = { workspace = true } dev-settings = [] ## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test ## code in other crates. Enabled by openshell-sandbox and -## openshell-supervisor-networking dev builds. +## openshell-supervisor-network dev builds. 
test-helpers = [] [build-dependencies] diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 79c5c0b47..aacfb528a 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -19,7 +19,7 @@ openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } openshell-policy = { path = "../openshell-policy" } openshell-router = { path = "../openshell-router" } -openshell-supervisor-networking = { path = "../openshell-supervisor-networking" } +openshell-supervisor-network = { path = "../openshell-supervisor-network" } openshell-supervisor-process = { path = "../openshell-supervisor-process" } # Async runtime diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index fcc189f4c..1f39c34cc 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -54,8 +54,8 @@ pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_propos use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; -use openshell_supervisor_networking::mechanistic_mapper; -use openshell_supervisor_networking::opa::OpaEngine; +use openshell_supervisor_network::mechanistic_mapper; +use openshell_supervisor_network::opa::OpaEngine; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; @@ -121,7 +121,7 @@ pub async fn run_sandbox( ) .await?; let policy_local_ctx = Arc::new( - openshell_supervisor_networking::policy_local::PolicyLocalContext::new( + openshell_supervisor_network::policy_local::PolicyLocalContext::new( retained_proto.clone(), openshell_endpoint.clone(), sandbox_name_for_agg.clone().or_else(|| sandbox_id.clone()), @@ -257,9 +257,9 @@ pub async fn run_sandbox( // it via 
setns(). The RAII handle lives in this frame for the duration // of the sandbox. #[cfg(target_os = "linux")] - let netns = openshell_supervisor_networking::run::create_netns_for_proxy(&policy)?; + let netns = openshell_supervisor_network::run::create_netns_for_proxy(&policy)?; - let mut networking = openshell_supervisor_networking::run::run_networking( + let mut networking = openshell_supervisor_network::run::run_networking( &policy, #[cfg(target_os = "linux")] netns.as_ref(), @@ -390,11 +390,10 @@ pub async fn run_sandbox( .and_then(|v| v.parse().ok()) .unwrap_or(10); - let aggregator = - openshell_supervisor_networking::denial_aggregator::DenialAggregator::new( - rx, - flush_interval_secs, - ); + let aggregator = openshell_supervisor_network::denial_aggregator::DenialAggregator::new( + rx, + flush_interval_secs, + ); tokio::spawn(async move { aggregator @@ -1399,7 +1398,7 @@ fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { async fn flush_proposals_to_gateway( endpoint: &str, sandbox_name: &str, - summaries: Vec, + summaries: Vec, ) -> Result<()> { use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::{DenialSummary, L7RequestSample}; @@ -1471,8 +1470,7 @@ struct PolicyPollLoopContext { interval_secs: u64, ocsf_enabled: Arc, provider_credentials: ProviderCredentialState, - policy_local_ctx: - Option>, + policy_local_ctx: Option>, } async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { diff --git a/crates/openshell-supervisor-networking/Cargo.toml b/crates/openshell-supervisor-network/Cargo.toml similarity index 89% rename from crates/openshell-supervisor-networking/Cargo.toml rename to crates/openshell-supervisor-network/Cargo.toml index 26d4c7ec0..0eca09d1a 100644 --- a/crates/openshell-supervisor-networking/Cargo.toml +++ b/crates/openshell-supervisor-network/Cargo.toml @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 [package] -name = "openshell-supervisor-networking" -description = "Networking 
component of the OpenShell supervisor: proxy, L7 enforcement, OPA, inference routing, denial aggregator" +name = "openshell-supervisor-network" +description = "Network component of the OpenShell supervisor: proxy, L7 enforcement, OPA, inference routing, denial aggregator" version.workspace = true edition.workspace = true license.workspace = true diff --git a/crates/openshell-supervisor-networking/data/sandbox-policy.rego b/crates/openshell-supervisor-network/data/sandbox-policy.rego similarity index 100% rename from crates/openshell-supervisor-networking/data/sandbox-policy.rego rename to crates/openshell-supervisor-network/data/sandbox-policy.rego diff --git a/crates/openshell-supervisor-networking/src/bypass_monitor.rs b/crates/openshell-supervisor-network/src/bypass_monitor.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/bypass_monitor.rs rename to crates/openshell-supervisor-network/src/bypass_monitor.rs diff --git a/crates/openshell-supervisor-networking/src/denial_aggregator.rs b/crates/openshell-supervisor-network/src/denial_aggregator.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/denial_aggregator.rs rename to crates/openshell-supervisor-network/src/denial_aggregator.rs diff --git a/crates/openshell-supervisor-networking/src/identity.rs b/crates/openshell-supervisor-network/src/identity.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/identity.rs rename to crates/openshell-supervisor-network/src/identity.rs diff --git a/crates/openshell-supervisor-networking/src/inference_routes.rs b/crates/openshell-supervisor-network/src/inference_routes.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/inference_routes.rs rename to crates/openshell-supervisor-network/src/inference_routes.rs diff --git a/crates/openshell-supervisor-networking/src/l7/graphql.rs b/crates/openshell-supervisor-network/src/l7/graphql.rs similarity index 100% rename 
from crates/openshell-supervisor-networking/src/l7/graphql.rs rename to crates/openshell-supervisor-network/src/l7/graphql.rs diff --git a/crates/openshell-supervisor-networking/src/l7/inference.rs b/crates/openshell-supervisor-network/src/l7/inference.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/inference.rs rename to crates/openshell-supervisor-network/src/l7/inference.rs diff --git a/crates/openshell-supervisor-networking/src/l7/mod.rs b/crates/openshell-supervisor-network/src/l7/mod.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/mod.rs rename to crates/openshell-supervisor-network/src/l7/mod.rs diff --git a/crates/openshell-supervisor-networking/src/l7/path.rs b/crates/openshell-supervisor-network/src/l7/path.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/path.rs rename to crates/openshell-supervisor-network/src/l7/path.rs diff --git a/crates/openshell-supervisor-networking/src/l7/provider.rs b/crates/openshell-supervisor-network/src/l7/provider.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/provider.rs rename to crates/openshell-supervisor-network/src/l7/provider.rs diff --git a/crates/openshell-supervisor-networking/src/l7/relay.rs b/crates/openshell-supervisor-network/src/l7/relay.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/relay.rs rename to crates/openshell-supervisor-network/src/l7/relay.rs diff --git a/crates/openshell-supervisor-networking/src/l7/rest.rs b/crates/openshell-supervisor-network/src/l7/rest.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/rest.rs rename to crates/openshell-supervisor-network/src/l7/rest.rs diff --git a/crates/openshell-supervisor-networking/src/l7/tls.rs b/crates/openshell-supervisor-network/src/l7/tls.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/tls.rs rename to 
crates/openshell-supervisor-network/src/l7/tls.rs diff --git a/crates/openshell-supervisor-networking/src/l7/websocket.rs b/crates/openshell-supervisor-network/src/l7/websocket.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/l7/websocket.rs rename to crates/openshell-supervisor-network/src/l7/websocket.rs diff --git a/crates/openshell-supervisor-networking/src/lib.rs b/crates/openshell-supervisor-network/src/lib.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/lib.rs rename to crates/openshell-supervisor-network/src/lib.rs diff --git a/crates/openshell-supervisor-networking/src/mechanistic_mapper.rs b/crates/openshell-supervisor-network/src/mechanistic_mapper.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/mechanistic_mapper.rs rename to crates/openshell-supervisor-network/src/mechanistic_mapper.rs diff --git a/crates/openshell-supervisor-networking/src/opa.rs b/crates/openshell-supervisor-network/src/opa.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/opa.rs rename to crates/openshell-supervisor-network/src/opa.rs diff --git a/crates/openshell-supervisor-networking/src/policy_local.rs b/crates/openshell-supervisor-network/src/policy_local.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/policy_local.rs rename to crates/openshell-supervisor-network/src/policy_local.rs diff --git a/crates/openshell-supervisor-networking/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/proxy.rs rename to crates/openshell-supervisor-network/src/proxy.rs diff --git a/crates/openshell-supervisor-networking/src/run.rs b/crates/openshell-supervisor-network/src/run.rs similarity index 100% rename from crates/openshell-supervisor-networking/src/run.rs rename to crates/openshell-supervisor-network/src/run.rs diff --git 
a/crates/openshell-supervisor-networking/testdata/sandbox-policy.yaml b/crates/openshell-supervisor-network/testdata/sandbox-policy.yaml similarity index 100% rename from crates/openshell-supervisor-networking/testdata/sandbox-policy.yaml rename to crates/openshell-supervisor-network/testdata/sandbox-policy.yaml diff --git a/crates/openshell-supervisor-networking/tests/system_inference.rs b/crates/openshell-supervisor-network/tests/system_inference.rs similarity index 94% rename from crates/openshell-supervisor-networking/tests/system_inference.rs rename to crates/openshell-supervisor-network/tests/system_inference.rs index 324240c0a..ef1b5f54d 100644 --- a/crates/openshell-supervisor-networking/tests/system_inference.rs +++ b/crates/openshell-supervisor-network/tests/system_inference.rs @@ -9,7 +9,7 @@ use openshell_router::Router; use openshell_router::config::{AuthHeader, ResolvedRoute}; -use openshell_supervisor_networking::proxy::InferenceContext; +use openshell_supervisor_network::proxy::InferenceContext; fn make_system_route() -> ResolvedRoute { ResolvedRoute { @@ -42,7 +42,7 @@ fn make_user_route() -> ResolvedRoute { #[tokio::test] async fn system_inference_routes_to_mock_backend() { let router = Router::new().unwrap(); - let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); + let patterns = openshell_supervisor_network::l7::inference::default_patterns(); let ctx = InferenceContext::new( patterns, @@ -86,7 +86,7 @@ async fn system_inference_routes_to_mock_backend() { #[tokio::test] async fn system_inference_uses_system_routes_not_user_routes() { let router = Router::new().unwrap(); - let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); + let patterns = openshell_supervisor_network::l7::inference::default_patterns(); // Only user routes configured — no system routes let ctx = InferenceContext::new(patterns, router, vec![make_user_route()], vec![]); @@ -118,7 +118,7 @@ async fn 
system_inference_uses_system_routes_not_user_routes() { #[tokio::test] async fn system_inference_with_anthropic_protocol() { let router = Router::new().unwrap(); - let patterns = openshell_supervisor_networking::l7::inference::default_patterns(); + let patterns = openshell_supervisor_network::l7::inference::default_patterns(); let system_route = ResolvedRoute { name: "sandbox-system".to_string(), diff --git a/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs b/crates/openshell-supervisor-network/tests/websocket_upgrade.rs similarity index 98% rename from crates/openshell-supervisor-networking/tests/websocket_upgrade.rs rename to crates/openshell-supervisor-network/tests/websocket_upgrade.rs index dfeefc0f6..322d6709c 100644 --- a/crates/openshell-supervisor-networking/tests/websocket_upgrade.rs +++ b/crates/openshell-supervisor-network/tests/websocket_upgrade.rs @@ -26,10 +26,8 @@ use futures::SinkExt; use futures::stream::StreamExt; -use openshell_supervisor_networking::l7::provider::{ - BodyLength, L7Provider, L7Request, RelayOutcome, -}; -use openshell_supervisor_networking::l7::rest::RestProvider; +use openshell_supervisor_network::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; +use openshell_supervisor_network::l7::rest::RestProvider; use std::collections::HashMap; use std::net::SocketAddr; use tokio::io::{AsyncReadExt, AsyncWriteExt}; From 315d0b26f445c6829849c3e7c8cb226ab5d7814f Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:24:20 +0300 Subject: [PATCH 36/49] refactor(supervisor-network): own denial-aggregator flush end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the denial-aggregator spawn and flush_proposals_to_gateway out of run_sandbox and into run_networking. 
The networking leaf already owns every other input (proxy + bypass_monitor as producers, denial channel, mechanistic_mapper, denial_aggregator) and already opens its own gRPC connections (inference_routes, policy_local) — the orchestrator was the only piece left straddling the boundary. Networking now drives the full path: producers -> channel -> aggregator -> flush -> gateway. Drops denial_rx from Networking; adds sandbox_name to run_networking so SubmitPolicyAnalysis can resolve by sandbox name (falls back to ID when unset). Same shape as log_push in the process leaf. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 100 +---------------- .../openshell-supervisor-network/src/run.rs | 104 +++++++++++++++++- 2 files changed, 102 insertions(+), 102 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 1f39c34cc..e41deb6a0 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -54,7 +54,6 @@ pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_propos use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; -use openshell_supervisor_network::mechanistic_mapper; use openshell_supervisor_network::opa::OpaEngine; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; @@ -268,6 +267,7 @@ pub async fn run_sandbox( &provider_credentials, &policy_local_ctx, sandbox_id.as_deref(), + sandbox_name_for_agg.as_deref(), openshell_endpoint_for_proxy.as_deref(), inference_routes.as_deref(), ) @@ -377,41 +377,6 @@ pub async fn run_sandbox( ); } }); - - // Spawn denial aggregator (gRPC mode only, when proxy is active). - if let Some(rx) = networking.denial_rx.take() { - // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID. 
- let agg_name = sandbox_name_for_agg - .as_deref() - .map_or_else(|| id.to_string(), str::to_string); - let agg_endpoint = endpoint.to_string(); - let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(10); - - let aggregator = openshell_supervisor_network::denial_aggregator::DenialAggregator::new( - rx, - flush_interval_secs, - ); - - tokio::spawn(async move { - aggregator - .run(|summaries| { - let endpoint = agg_endpoint.clone(); - let sandbox_name = agg_name.clone(); - async move { - if let Err(e) = - flush_proposals_to_gateway(&endpoint, &sandbox_name, summaries) - .await - { - warn!(error = %e, "Failed to flush denial summaries to gateway"); - } - } - }) - .await; - }); - } } let exit_code = openshell_supervisor_process::run::run_process( @@ -1394,69 +1359,6 @@ fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { /// Background loop that polls the server for policy updates. /// /// When a new version is detected, attempts to reload the OPA engine via -/// Flush aggregated denial summaries to the gateway via `SubmitPolicyAnalysis`. -async fn flush_proposals_to_gateway( - endpoint: &str, - sandbox_name: &str, - summaries: Vec, -) -> Result<()> { - use openshell_core::grpc_client::CachedOpenShellClient; - use openshell_core::proto::{DenialSummary, L7RequestSample}; - - let client = CachedOpenShellClient::connect(endpoint).await?; - - // Convert FlushableDenialSummary to proto DenialSummary. 
- let proto_summaries: Vec = summaries - .into_iter() - .map(|s| DenialSummary { - sandbox_id: String::new(), - host: s.host, - port: u32::from(s.port), - binary: s.binary, - ancestors: s.ancestors, - deny_reason: s.deny_reason, - first_seen_ms: s.first_seen_ms, - last_seen_ms: s.last_seen_ms, - count: s.count, - suppressed_count: 0, - total_count: s.count, - sample_cmdlines: s.sample_cmdlines, - binary_sha256: String::new(), - persistent: false, - denial_stage: s.denial_stage, - l7_request_samples: s - .l7_samples - .into_iter() - .map(|l| L7RequestSample { - method: l.method, - path: l.path, - decision: "deny".to_string(), - count: l.count, - }) - .collect(), - l7_inspection_active: false, - }) - .collect(); - - // Run the mechanistic mapper sandbox-side to generate proposals. - // The gateway is a thin persistence + validation layer — it never - // generates proposals itself. - let proposals = mechanistic_mapper::generate_proposals(&proto_summaries); - - info!( - sandbox_name = %sandbox_name, - summaries = proto_summaries.len(), - proposals = proposals.len(), - "Flushed denial analysis to gateway" - ); - - client - .submit_policy_analysis(sandbox_name, proto_summaries, proposals, "mechanistic") - .await?; - - Ok(()) -} - /// `reload_from_proto_with_pid()`. Reports load success/failure back to the /// server. On failure, the previous engine is untouched (LKG behavior). 
/// diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index 0f9db177d..2f14459de 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -14,8 +14,8 @@ use std::sync::Arc; use std::sync::atomic::AtomicU32; use miette::Result; +use tracing::{info, warn}; -use openshell_core::DenialEvent; #[cfg(target_os = "linux")] use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, SandboxPolicy}; @@ -24,11 +24,13 @@ use openshell_ocsf::{ ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ctx::ctx as ocsf_ctx, ocsf_emit, }; +use crate::denial_aggregator::{DenialAggregator, FlushableDenialSummary}; use crate::identity::BinaryIdentityCache; use crate::l7::tls::{ CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, write_ca_files, }; +use crate::mechanistic_mapper; use crate::opa::OpaEngine; use crate::policy_local::PolicyLocalContext; use crate::proxy::ProxyHandle; @@ -99,7 +101,6 @@ pub struct Networking { pub ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, pub ssh_proxy_url: Option, pub ssh_netns_fd: Option, - pub denial_rx: Option>, } /// Set up the networking stack: ephemeral CA + TLS state, proxy server, @@ -123,6 +124,7 @@ pub async fn run_networking( provider_credentials: &ProviderCredentialState, policy_local_ctx: &Arc, sandbox_id: Option<&str>, + sandbox_name: Option<&str>, openshell_endpoint: Option<&str>, inference_routes: Option<&str>, ) -> Result { @@ -254,6 +256,41 @@ pub async fn run_networking( (None, None, None) }; + // Spawn the denial-aggregator flush task. The aggregator drains denial + // events from the proxy + bypass monitor, batches them, and ships + // summaries to the gateway via SubmitPolicyAnalysis. 
+ if let (Some(rx), Some(endpoint)) = (denial_rx, openshell_endpoint) { + // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID — fall back + // to the ID when the name isn't set. + let agg_name = sandbox_name + .map(str::to_string) + .or_else(|| sandbox_id.map(str::to_string)) + .unwrap_or_default(); + let agg_endpoint = endpoint.to_string(); + let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + + let aggregator = DenialAggregator::new(rx, flush_interval_secs); + + tokio::spawn(async move { + aggregator + .run(|summaries| { + let endpoint = agg_endpoint.clone(); + let sandbox_name = agg_name.clone(); + async move { + if let Err(e) = + flush_proposals_to_gateway(&endpoint, &sandbox_name, summaries).await + { + warn!(error = %e, "Failed to flush denial summaries to gateway"); + } + } + }) + .await; + }); + } + // Spawn bypass detection monitor (Linux only, proxy mode only). // Reads /dev/kmsg for nftables log entries and emits structured // tracing events for direct connection attempts that bypass the proxy. @@ -315,6 +352,67 @@ pub async fn run_networking( ca_file_paths, ssh_proxy_url, ssh_netns_fd, - denial_rx, }) } + +/// Flush aggregated denial summaries to the gateway via `SubmitPolicyAnalysis`. 
+async fn flush_proposals_to_gateway( + endpoint: &str, + sandbox_name: &str, + summaries: Vec, +) -> Result<()> { + use openshell_core::grpc_client::CachedOpenShellClient; + use openshell_core::proto::{DenialSummary, L7RequestSample}; + + let client = CachedOpenShellClient::connect(endpoint).await?; + + let proto_summaries: Vec = summaries + .into_iter() + .map(|s| DenialSummary { + sandbox_id: String::new(), + host: s.host, + port: u32::from(s.port), + binary: s.binary, + ancestors: s.ancestors, + deny_reason: s.deny_reason, + first_seen_ms: s.first_seen_ms, + last_seen_ms: s.last_seen_ms, + count: s.count, + suppressed_count: 0, + total_count: s.count, + sample_cmdlines: s.sample_cmdlines, + binary_sha256: String::new(), + persistent: false, + denial_stage: s.denial_stage, + l7_request_samples: s + .l7_samples + .into_iter() + .map(|l| L7RequestSample { + method: l.method, + path: l.path, + decision: "deny".to_string(), + count: l.count, + }) + .collect(), + l7_inspection_active: false, + }) + .collect(); + + // Run the mechanistic mapper sandbox-side to generate proposals. + // The gateway is a thin persistence + validation layer — it never + // generates proposals itself. + let proposals = mechanistic_mapper::generate_proposals(&proto_summaries); + + info!( + sandbox_name = %sandbox_name, + summaries = proto_summaries.len(), + proposals = proposals.len(), + "Flushed denial analysis to gateway" + ); + + client + .submit_policy_analysis(sandbox_name, proto_summaries, proposals, "mechanistic") + .await?; + + Ok(()) +} From 0a3bbdae9e214c4eb8658cf1d62af3a000aabbb5 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:28:33 +0300 Subject: [PATCH 37/49] refactor(supervisor-network): own symlink-resolution task Move the OPA binary-symlink resolver out of run_sandbox and into run_networking. 
The task probes /proc/<pid>/root/ until the workload's mount namespace is accessible, then rebuilds the OPA engine with resolved binary paths so policy rules match canonical names instead of symlinks. Both inputs (Arc<OpaEngine>, retained_proto) are networking-leaf concerns and were already plumbed into run_networking; the entrypoint_pid Arc<AtomicU32> is read lazily after the process leaf populates it. Adds retained_proto as a parameter and spawns the resolver early in run_networking so the probe loop starts before the proxy comes up. Same shape as the denial-flush move: networking owns its own background task end-to-end; the orchestrator stops hosting work that doesn't conceptually belong to it. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 60 +----------------- .../openshell-supervisor-network/src/run.rs | 62 ++++++++++++++++++- 2 files changed, 61 insertions(+), 61 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index e41deb6a0..0c40894af 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -263,6 +263,7 @@ pub async fn run_sandbox( #[cfg(target_os = "linux")] netns.as_ref(), opa_engine.as_ref(), + retained_proto.as_ref(), entrypoint_pid.clone(), &provider_credentials, &policy_local_ctx, @@ -278,65 +279,6 @@ pub async fn run_sandbox( // listener and workload process are exposed. apply_supervisor_startup_hardening()?; - // Spawn a task to resolve policy binary symlinks after the container - // filesystem becomes accessible via /proc//root/. This expands - // symlinks like /usr/bin/python3 → /usr/bin/python3.11 in the OPA - // policy data so that either path matches at evaluation time. - // - // The task probes /proc//root/ with retries until accessible. It - // reads `entrypoint_pid` lazily, so spawning here (before `run_process` - // sets the PID) is safe — the probe loop just waits.
- if let (Some(engine), Some(proto)) = (opa_engine.as_ref(), retained_proto.as_ref()) { - let resolve_engine = engine.clone(); - let resolve_proto = proto.clone(); - let resolve_pid = entrypoint_pid.clone(); - tokio::spawn(async move { - let pid = resolve_pid.load(Ordering::Acquire); - let probe_path = format!("/proc/{pid}/root/"); - // Retry up to 10 times with 500ms intervals (5s total). - // The child's mount namespace is typically ready within a - // few hundred ms of spawn. - for attempt in 1..=10 { - tokio::time::sleep(Duration::from_millis(500)).await; - if std::fs::metadata(&probe_path).is_ok() { - info!( - pid = pid, - attempt = attempt, - "Container filesystem accessible, resolving policy binary symlinks" - ); - match resolve_engine.reload_from_proto_with_pid(&resolve_proto, pid) { - Ok(()) => { - info!( - pid = pid, - "Policy binary symlink resolution complete \ - (check logs above for per-binary results)" - ); - } - Err(e) => { - warn!( - "Failed to rebuild OPA engine with symlink resolution \ - (non-fatal, falling back to literal path matching): {e}" - ); - } - } - return; - } - debug!( - pid = pid, - attempt = attempt, - probe_path = %probe_path, - "Container filesystem not yet accessible, retrying symlink resolution" - ); - } - warn!( - "Container filesystem /proc/{pid}/root/ not accessible after 10 attempts (5s); \ - binary symlink resolution skipped. Policy binary paths will be matched literally. \ - If binaries are symlinks, use canonical paths in your policy \ - (run 'readlink -f ' inside the sandbox)" - ); - }); - } - // Spawn background policy poll task (gRPC mode only). 
if let (Some(id), Some(endpoint), Some(engine)) = ( sandbox_id.as_deref(), diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index 2f14459de..6a6e9152e 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -11,14 +11,16 @@ use std::net::SocketAddr; use std::sync::Arc; -use std::sync::atomic::AtomicU32; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::time::Duration; use miette::Result; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; #[cfg(target_os = "linux")] use openshell_core::netns::NetworkNamespace; use openshell_core::policy::{NetworkMode, SandboxPolicy}; +use openshell_core::proto::SandboxPolicy as ProtoSandboxPolicy; use openshell_core::provider_credentials::ProviderCredentialState; use openshell_ocsf::{ ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ctx::ctx as ocsf_ctx, ocsf_emit, @@ -120,6 +122,7 @@ pub async fn run_networking( policy: &SandboxPolicy, #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, opa_engine: Option<&Arc>, + retained_proto: Option<&ProtoSandboxPolicy>, entrypoint_pid: Arc, provider_credentials: &ProviderCredentialState, policy_local_ctx: &Arc, @@ -128,6 +131,61 @@ pub async fn run_networking( openshell_endpoint: Option<&str>, inference_routes: Option<&str>, ) -> Result { + // Spawn a task to resolve policy binary symlinks once the workload's mount + // namespace becomes accessible via /proc//root/. Reads entrypoint_pid + // lazily, so spawning before run_process sets the PID is safe — the probe + // loop just waits. 
+ if let (Some(engine), Some(proto)) = (opa_engine, retained_proto) { + let resolve_engine = engine.clone(); + let resolve_proto = proto.clone(); + let resolve_pid = entrypoint_pid.clone(); + tokio::spawn(async move { + let pid = resolve_pid.load(Ordering::Acquire); + let probe_path = format!("/proc/{pid}/root/"); + // Retry up to 10 times with 500ms intervals (5s total). + // The child's mount namespace is typically ready within a + // few hundred ms of spawn. + for attempt in 1..=10 { + tokio::time::sleep(Duration::from_millis(500)).await; + if std::fs::metadata(&probe_path).is_ok() { + info!( + pid = pid, + attempt = attempt, + "Container filesystem accessible, resolving policy binary symlinks" + ); + match resolve_engine.reload_from_proto_with_pid(&resolve_proto, pid) { + Ok(()) => { + info!( + pid = pid, + "Policy binary symlink resolution complete \ + (check logs above for per-binary results)" + ); + } + Err(e) => { + warn!( + "Failed to rebuild OPA engine with symlink resolution \ + (non-fatal, falling back to literal path matching): {e}" + ); + } + } + return; + } + debug!( + pid = pid, + attempt = attempt, + probe_path = %probe_path, + "Container filesystem not yet accessible, retrying symlink resolution" + ); + } + warn!( + "Container filesystem /proc/{pid}/root/ not accessible after 10 attempts (5s); \ + binary symlink resolution skipped. Policy binary paths will be matched literally. \ + If binaries are symlinks, use canonical paths in your policy \ + (run 'readlink -f ' inside the sandbox)" + ); + }); + } + // Identity cache for SHA256 TOFU when OPA is active. Only consumed by // the proxy, so it's owned here. 
let identity_cache = opa_engine.map(|_| Arc::new(BinaryIdentityCache::new())); From 22f39c132383ddf694014f508037095ed31a0fdf Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:40:12 +0300 Subject: [PATCH 38/49] refactor(supervisor-process): move seccomp install into run_process The supervisor seccomp prelude is part of "set up the workload-side process tree", not part of orchestration. Move the call site from run_sandbox into the top of run_process and drop the now-unused re-export from openshell-sandbox::lib. Timing is preserved: by the time the orchestrator calls run_process, run_networking has already returned, so netns + nftables setup is complete. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 6 ------ crates/openshell-supervisor-process/src/run.rs | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 0c40894af..22fb758e0 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -56,7 +56,6 @@ use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPol use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_network::opa::OpaEngine; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; -pub use openshell_supervisor_process::sandbox::apply_supervisor_startup_hardening; use openshell_supervisor_process::skills; /// Run a command in the sandbox. @@ -274,11 +273,6 @@ pub async fn run_sandbox( ) .await?; - // Install the supervisor seccomp prelude after privileged startup helpers - // (network namespace setup, nftables probes) complete, but before the SSH - // listener and workload process are exposed. - apply_supervisor_startup_hardening()?; - // Spawn background policy poll task (gRPC mode only). 
if let (Some(id), Some(endpoint), Some(engine)) = ( sandbox_id.as_deref(), diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index fc40859f8..209be49bc 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -60,6 +60,12 @@ pub async fn run_process( ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, ) -> Result { + // Install the supervisor seccomp prelude before spawning any workload-side + // tasks. By this point the orchestrator has finished privileged startup + // helpers (network namespace setup, nftables probes via run_networking), + // and the SSH listener and entrypoint child have not been exposed yet. + crate::sandbox::apply_supervisor_startup_hardening()?; + // Zombie reaper — openshell-sandbox may run as PID 1 in containers and // must reap orphaned grandchildren (e.g. background daemons started by // coding agents) to prevent zombie accumulation. From dfd2aa2f74f907f9e238de44fa0296c29fd8d8c5 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:42:59 +0300 Subject: [PATCH 39/49] refactor(supervisor-process): move check_runtime_pid_limit into run_process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PID-limit precondition is process-side: it gates whether the workload child can be spawned at all. Move the call from run_sandbox into the top of run_process, alongside the seccomp prelude. Same shape as the seccomp move — function already lives process-side, only the call site relocates. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 10 ---------- crates/openshell-supervisor-process/src/run.rs | 11 +++++++++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 22fb758e0..fef906831 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -191,16 +191,6 @@ pub async fn run_sandbox( // Prepare filesystem: create and chown read_write directories prepare_filesystem(&policy)?; - #[cfg(target_os = "linux")] - { - let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { - openshell_supervisor_process::process::RuntimePidLimitMode::Require - } else { - openshell_supervisor_process::process::RuntimePidLimitMode::Warn - }; - openshell_supervisor_process::process::check_runtime_pid_limit(pid_limit_mode)?; - } - // Initialize the agent-proposals feature flag. Default false until the // initial settings fetch (or the poll loop) tells us otherwise. The flag // gates the skill install, the policy.local route handler, and the L7 diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index 209be49bc..0874f03f1 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -66,6 +66,17 @@ pub async fn run_process( // and the SSH listener and entrypoint child have not been exposed yet. crate::sandbox::apply_supervisor_startup_hardening()?; + // Verify the runtime PID limit can accommodate the policy's pid_max. 
+ #[cfg(target_os = "linux")] + { + let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { + crate::process::RuntimePidLimitMode::Require + } else { + crate::process::RuntimePidLimitMode::Warn + }; + crate::process::check_runtime_pid_limit(pid_limit_mode)?; + } + // Zombie reaper — openshell-sandbox may run as PID 1 in containers and // must reap orphaned grandchildren (e.g. background daemons started by // coding agents) to prevent zombie accumulation. From 748f57813658f3b05e4523e3f6f3f120dd3551f6 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:45:35 +0300 Subject: [PATCH 40/49] refactor(supervisor-process): move validate_sandbox_user to process crate The sandbox-user check is a precondition for privilege-dropping the workload child; it has no relevance to networking. Move the function next to drop_privileges in openshell-supervisor-process::process and call it from the top of run_process. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 44 ------------------- .../src/process.rs | 37 ++++++++++++++++ .../openshell-supervisor-process/src/run.rs | 6 +++ 3 files changed, 43 insertions(+), 44 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index fef906831..ade5c3d4c 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -126,11 +126,6 @@ pub async fn run_sandbox( ), ); - // Validate that the required "sandbox" user exists in this image. - // All sandbox images must include this user for privilege dropping. - #[cfg(unix)] - validate_sandbox_user(&policy)?; - // Fetch provider environment variables from the server. // This is done after loading the policy so the sandbox can still start // even if provider env fetch fails (graceful degradation). 
@@ -1143,45 +1138,6 @@ fn discover_policy_from_path(path: &std::path::Path) -> openshell_core::proto::S } } -/// Validate that the `sandbox` user exists in this image. -/// -/// All sandbox images must include a `sandbox` user for privilege dropping. -/// This check runs at supervisor startup (inside the container) where we can -/// inspect `/etc/passwd`. If the user is missing, the sandbox fails fast -/// with a clear error instead of silently running child processes as root. -#[cfg(unix)] -fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { - use nix::unistd::User; - - let user_name = policy.process.run_as_user.as_deref().unwrap_or("sandbox"); - - if user_name.is_empty() || user_name == "sandbox" { - match User::from_name("sandbox") { - Ok(Some(_)) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "validated") - .message("Validated 'sandbox' user exists in image") - .build() - ); - } - Ok(None) => { - return Err(miette::miette!( - "sandbox user 'sandbox' not found in image; \ - all sandbox images must include a 'sandbox' user and group" - )); - } - Err(e) => { - return Err(miette::miette!("failed to look up 'sandbox' user: {e}")); - } - } - } - - Ok(()) -} - /// Prepare a `read_write` path for the sandboxed process. /// /// Returns `true` when the path was created by the supervisor and therefore diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index 2437bcb3b..60148bfd0 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -493,6 +493,43 @@ impl Drop for ProcessHandle { } } +/// Validate that the `sandbox` user exists in this image. +/// +/// All sandbox images must include a `sandbox` user for privilege dropping. 
+/// This check runs at supervisor startup (inside the container) where we can +/// inspect `/etc/passwd`. If the user is missing, the sandbox fails fast +/// with a clear error instead of silently running child processes as root. +#[cfg(unix)] +pub fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { + let user_name = policy.process.run_as_user.as_deref().unwrap_or("sandbox"); + + if user_name.is_empty() || user_name == "sandbox" { + match User::from_name("sandbox") { + Ok(Some(_)) => { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Enabled, "validated") + .message("Validated 'sandbox' user exists in image") + .build() + ); + } + Ok(None) => { + return Err(miette::miette!( + "sandbox user 'sandbox' not found in image; \ + all sandbox images must include a 'sandbox' user and group" + )); + } + Err(e) => { + return Err(miette::miette!("failed to look up 'sandbox' user: {e}")); + } + } + } + + Ok(()) +} + // `effective_gid`/`effective_uid` are intentionally parallel names (same role // for different identifiers) and the noise from renaming would obscure intent. #[cfg(unix)] diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index 0874f03f1..746a0a16f 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -60,6 +60,12 @@ pub async fn run_process( ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, ) -> Result { + // Validate that the sandbox user exists in the image. All sandbox images + // must include a "sandbox" user for privilege dropping; failing fast here + // beats silently running children as root. 
+ #[cfg(unix)] + crate::process::validate_sandbox_user(policy)?; + // Install the supervisor seccomp prelude before spawning any workload-side // tasks. By this point the orchestrator has finished privileged startup // helpers (network namespace setup, nftables probes via run_networking), From c05febbf2e5b40e0c5a16f363d33ffef467eed05 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:48:54 +0300 Subject: [PATCH 41/49] refactor(supervisor-process): move prepare_filesystem to process crate Creating and chowning read_write directories is workload-side preparation, not orchestration. Move prepare_filesystem and its prepare_read_write_path helper (plus tests) into openshell-supervisor-process::process and call from run_process, alongside validate_sandbox_user. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 206 +----------------- .../src/process.rs | 200 +++++++++++++++++ .../openshell-supervisor-process/src/run.rs | 6 + 3 files changed, 207 insertions(+), 205 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index ade5c3d4c..d0361188b 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -5,7 +5,7 @@ //! //! This crate provides process sandboxing and monitoring capabilities. -use miette::{IntoDiagnostic, Result}; +use miette::Result; use std::future::Future; use std::sync::Arc; use std::sync::atomic::{AtomicU32, Ordering}; @@ -183,9 +183,6 @@ pub async fn run_sandbox( ); let provider_env = provider_credentials.snapshot().child_env.clone(); - // Prepare filesystem: create and chown read_write directories - prepare_filesystem(&policy)?; - // Initialize the agent-proposals feature flag. Default false until the // initial settings fetch (or the poll loop) tells us otherwise. 
The flag // gates the skill install, the policy.local route handler, and the L7 @@ -1138,106 +1135,6 @@ fn discover_policy_from_path(path: &std::path::Path) -> openshell_core::proto::S } } -/// Prepare a `read_write` path for the sandboxed process. -/// -/// Returns `true` when the path was created by the supervisor and therefore -/// still needs to be chowned to the sandbox user/group. Existing paths keep -/// their image-defined ownership. -#[cfg(unix)] -fn prepare_read_write_path(path: &std::path::Path) -> Result { - // SECURITY: use symlink_metadata (lstat) to inspect each path *before* - // calling chown. chown follows symlinks, so a malicious container image - // could place a symlink (e.g. /sandbox -> /etc/shadow) to trick the - // root supervisor into transferring ownership of arbitrary files. - // The TOCTOU window between lstat and chown is not exploitable because - // no untrusted process is running yet (the child has not been forked). - if let Ok(meta) = std::fs::symlink_metadata(path) { - if meta.file_type().is_symlink() { - return Err(miette::miette!( - "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", - path.display() - )); - } - - debug!( - path = %path.display(), - "Preserving ownership for existing read_write path" - ); - Ok(false) - } else { - debug!(path = %path.display(), "Creating read_write directory"); - std::fs::create_dir_all(path).into_diagnostic()?; - Ok(true) - } -} - -/// Prepare filesystem for the sandboxed process. -/// -/// Creates `read_write` directories if they don't exist and sets ownership -/// on newly-created paths to the configured sandbox user/group. This runs as -/// the supervisor (root) before forking the child process. 
-#[cfg(unix)] -fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { - use nix::unistd::{Group, User, chown}; - - let user_name = match policy.process.run_as_user.as_deref() { - Some(name) if !name.is_empty() => Some(name), - _ => None, - }; - let group_name = match policy.process.run_as_group.as_deref() { - Some(name) if !name.is_empty() => Some(name), - _ => None, - }; - - // If no user/group configured, nothing to do - if user_name.is_none() && group_name.is_none() { - return Ok(()); - } - - // Resolve user and group - let uid = if let Some(name) = user_name { - Some( - User::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? - .uid, - ) - } else { - None - }; - - let gid = if let Some(name) = group_name { - Some( - Group::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? - .gid, - ) - } else { - None - }; - - // Create missing read_write paths and only chown the ones we created. - for path in &policy.filesystem.read_write { - if prepare_read_write_path(path)? { - debug!( - path = %path.display(), - ?uid, - ?gid, - "Setting ownership on newly created read_write path" - ); - chown(path, uid, gid).into_diagnostic()?; - } - } - - Ok(()) -} - -#[cfg(not(unix))] -fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { - Ok(()) -} - /// Background loop that polls the server for policy updates. 
/// /// When a new version is detected, attempts to reload the OPA engine via @@ -1589,11 +1486,6 @@ fn format_setting_value(es: &openshell_core::proto::EffectiveSetting) -> String )] mod tests { use super::*; - #[cfg(unix)] - use nix::unistd::{Group, User}; - use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; - #[cfg(unix)] - use std::os::unix::fs::{MetadataExt, symlink}; // ---- Policy disk discovery tests ---- #[test] @@ -1687,100 +1579,4 @@ filesystem_policy: let local_policy = SandboxPolicy::try_from(proto).expect("conversion should succeed"); assert!(matches!(local_policy.network.mode, NetworkMode::Proxy)); } - - #[cfg(unix)] - fn sandbox_policy_with_read_write( - path: std::path::PathBuf, - run_as_user: Option, - run_as_group: Option, - ) -> SandboxPolicy { - SandboxPolicy { - version: 1, - filesystem: FilesystemPolicy { - read_only: vec![], - read_write: vec![path], - include_workdir: false, - }, - network: NetworkPolicy::default(), - landlock: LandlockPolicy::default(), - process: ProcessPolicy { - run_as_user, - run_as_group, - }, - } - } - - #[cfg(unix)] - #[test] - fn prepare_read_write_path_creates_missing_directory() { - let dir = tempfile::tempdir().unwrap(); - let missing = dir.path().join("missing").join("nested"); - - assert!(prepare_read_write_path(&missing).unwrap()); - assert!(missing.is_dir()); - } - - #[cfg(unix)] - #[test] - fn prepare_read_write_path_preserves_existing_directory() { - let dir = tempfile::tempdir().unwrap(); - let existing = dir.path().join("existing"); - std::fs::create_dir(&existing).unwrap(); - - assert!(!prepare_read_write_path(&existing).unwrap()); - assert!(existing.is_dir()); - } - - #[cfg(unix)] - #[test] - fn prepare_read_write_path_rejects_symlink() { - let dir = tempfile::tempdir().unwrap(); - let target = dir.path().join("target"); - let link = dir.path().join("link"); - std::fs::create_dir(&target).unwrap(); - symlink(&target, &link).unwrap(); - - let error = 
prepare_read_write_path(&link).unwrap_err(); - assert!( - error - .to_string() - .contains("is a symlink — refusing to chown"), - "unexpected error: {error}" - ); - } - - #[cfg(unix)] - #[test] - fn prepare_filesystem_skips_chown_for_existing_read_write_paths() { - if nix::unistd::geteuid().is_root() { - return; - } - - let current_user = User::from_uid(nix::unistd::geteuid()) - .unwrap() - .expect("current user entry"); - let restricted_group = Group::from_gid(nix::unistd::Gid::from_raw(0)) - .unwrap() - .expect("gid 0 group entry"); - if restricted_group.gid == nix::unistd::getegid() { - return; - } - - let dir = tempfile::tempdir().unwrap(); - let existing = dir.path().join("existing"); - std::fs::create_dir(&existing).unwrap(); - let before = std::fs::metadata(&existing).unwrap(); - - let policy = sandbox_policy_with_read_write( - existing.clone(), - Some(current_user.name), - Some(restricted_group.name), - ); - - prepare_filesystem(&policy).expect("existing path should not be re-owned"); - - let after = std::fs::metadata(&existing).unwrap(); - assert_eq!(after.uid(), before.uid()); - assert_eq!(after.gid(), before.gid()); - } } diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index 60148bfd0..cfd9e7b6e 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -530,6 +530,106 @@ pub fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { Ok(()) } +/// Prepare a `read_write` path for the sandboxed process. +/// +/// Returns `true` when the path was created by the supervisor and therefore +/// still needs to be chowned to the sandbox user/group. Existing paths keep +/// their image-defined ownership. +#[cfg(unix)] +fn prepare_read_write_path(path: &std::path::Path) -> Result { + // SECURITY: use symlink_metadata (lstat) to inspect each path *before* + // calling chown. 
chown follows symlinks, so a malicious container image + // could place a symlink (e.g. /sandbox -> /etc/shadow) to trick the + // root supervisor into transferring ownership of arbitrary files. + // The TOCTOU window between lstat and chown is not exploitable because + // no untrusted process is running yet (the child has not been forked). + if let Ok(meta) = std::fs::symlink_metadata(path) { + if meta.file_type().is_symlink() { + return Err(miette::miette!( + "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", + path.display() + )); + } + + debug!( + path = %path.display(), + "Preserving ownership for existing read_write path" + ); + Ok(false) + } else { + debug!(path = %path.display(), "Creating read_write directory"); + std::fs::create_dir_all(path).into_diagnostic()?; + Ok(true) + } +} + +/// Prepare filesystem for the sandboxed process. +/// +/// Creates `read_write` directories if they don't exist and sets ownership +/// on newly-created paths to the configured sandbox user/group. This runs as +/// the supervisor (root) before forking the child process. +#[cfg(unix)] +pub fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { + use nix::unistd::chown; + + let user_name = match policy.process.run_as_user.as_deref() { + Some(name) if !name.is_empty() => Some(name), + _ => None, + }; + let group_name = match policy.process.run_as_group.as_deref() { + Some(name) if !name.is_empty() => Some(name), + _ => None, + }; + + // If no user/group configured, nothing to do + if user_name.is_none() && group_name.is_none() { + return Ok(()); + } + + // Resolve user and group + let uid = if let Some(name) = user_name { + Some( + User::from_name(name) + .into_diagnostic()? + .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? + .uid, + ) + } else { + None + }; + + let gid = if let Some(name) = group_name { + Some( + Group::from_name(name) + .into_diagnostic()? 
+ .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? + .gid, + ) + } else { + None + }; + + // Create missing read_write paths and only chown the ones we created. + for path in &policy.filesystem.read_write { + if prepare_read_write_path(path)? { + debug!( + path = %path.display(), + ?uid, + ?gid, + "Setting ownership on newly created read_write path" + ); + chown(path, uid, gid).into_diagnostic()?; + } + } + + Ok(()) +} + +#[cfg(not(unix))] +pub fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { + Ok(()) +} + // `effective_gid`/`effective_uid` are intentionally parallel names (same role // for different identifiers) and the noise from renaming would obscure intent. #[cfg(unix)] @@ -972,4 +1072,104 @@ mod tests { let stdout = String::from_utf8(output.stdout).expect("utf8"); assert!(stdout.contains("ANTHROPIC_API_KEY=openshell:resolve:env:ANTHROPIC_API_KEY")); } + + #[cfg(unix)] + fn sandbox_policy_with_read_write( + path: PathBuf, + run_as_user: Option, + run_as_group: Option, + ) -> SandboxPolicy { + SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy { + read_only: vec![], + read_write: vec![path], + include_workdir: false, + }, + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user, + run_as_group, + }, + } + } + + #[cfg(unix)] + #[test] + fn prepare_read_write_path_creates_missing_directory() { + let dir = tempfile::tempdir().unwrap(); + let missing = dir.path().join("missing").join("nested"); + + assert!(prepare_read_write_path(&missing).unwrap()); + assert!(missing.is_dir()); + } + + #[cfg(unix)] + #[test] + fn prepare_read_write_path_preserves_existing_directory() { + let dir = tempfile::tempdir().unwrap(); + let existing = dir.path().join("existing"); + std::fs::create_dir(&existing).unwrap(); + + assert!(!prepare_read_write_path(&existing).unwrap()); + assert!(existing.is_dir()); + } + + #[cfg(unix)] + #[test] + fn prepare_read_write_path_rejects_symlink() { 
+ use std::os::unix::fs::symlink; + + let dir = tempfile::tempdir().unwrap(); + let target = dir.path().join("target"); + let link = dir.path().join("link"); + std::fs::create_dir(&target).unwrap(); + symlink(&target, &link).unwrap(); + + let error = prepare_read_write_path(&link).unwrap_err(); + assert!( + error + .to_string() + .contains("is a symlink — refusing to chown"), + "unexpected error: {error}" + ); + } + + #[cfg(unix)] + #[test] + fn prepare_filesystem_skips_chown_for_existing_read_write_paths() { + use std::os::unix::fs::MetadataExt; + + if nix::unistd::geteuid().is_root() { + return; + } + + let current_user = User::from_uid(nix::unistd::geteuid()) + .unwrap() + .expect("current user entry"); + let restricted_group = Group::from_gid(nix::unistd::Gid::from_raw(0)) + .unwrap() + .expect("gid 0 group entry"); + if restricted_group.gid == nix::unistd::getegid() { + return; + } + + let dir = tempfile::tempdir().unwrap(); + let existing = dir.path().join("existing"); + std::fs::create_dir(&existing).unwrap(); + let before = std::fs::metadata(&existing).unwrap(); + + let policy = sandbox_policy_with_read_write( + existing.clone(), + Some(current_user.name), + Some(restricted_group.name), + ); + + prepare_filesystem(&policy).expect("existing path should not be re-owned"); + + let after = std::fs::metadata(&existing).unwrap(); + assert_eq!(after.uid(), before.uid()); + assert_eq!(after.gid(), before.gid()); + } } diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index 746a0a16f..0b6572796 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -66,6 +66,12 @@ pub async fn run_process( #[cfg(unix)] crate::process::validate_sandbox_user(policy)?; + // Create read_write directories and chown newly-created ones to the + // sandbox user/group. Runs as the supervisor (root) before the child + // is forked so the workload sees writable paths it owns. 
+ #[cfg(unix)] + crate::process::prepare_filesystem(policy)?; + // Install the supervisor seccomp prelude before spawning any workload-side // tasks. By this point the orchestrator has finished privileged startup // helpers (network namespace setup, nftables probes via run_networking), From 0830218aba4ac06089fb651267bf05a97a6a5995 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:51:56 +0300 Subject: [PATCH 42/49] refactor(supervisor-process): move startup skill install into run_process The eager initial-settings fetch + agent skill install is process-side: the install materializes files the workload's filesystem sees. The orchestrator still owns the AGENT_PROPOSALS_ENABLED OnceLock init because the policy poll loop also reads it; only the early fetch and install hop into run_process. Behavior unchanged. Best-effort: any RPC or install failure is logged but does not fail startup. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 36 +---------- .../openshell-supervisor-process/src/run.rs | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index d0361188b..91274840d 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -8,7 +8,7 @@ use miette::Result; use std::future::Future; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use std::time::Duration; use tracing::{debug, info, warn}; @@ -50,7 +50,7 @@ pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; /// to gate the agent-controlled mutation surface. Exposed `pub(crate)` so /// unit tests in sibling modules can flip the flag through a serialized /// guard (see `policy_local::tests::ProposalsFlagGuard`). 
-pub(crate) use openshell_core::proposals::{AGENT_PROPOSALS_ENABLED, agent_proposals_enabled}; +pub(crate) use openshell_core::proposals::AGENT_PROPOSALS_ENABLED; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; @@ -195,38 +195,6 @@ pub async fn run_sandbox( debug!("agent proposals flag already initialized, keeping existing"); } - // Eagerly fetch the initial settings so skill install can honor the flag - // at startup rather than waiting for the poll loop's first tick. In - // offline/file-mode there is no gateway, so the flag stays false. - if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) - && let Ok(client) = - openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint).await - && let Ok(result) = client.poll_settings(id).await - { - let initial = extract_bool_setting( - &result.settings, - openshell_core::settings::AGENT_POLICY_PROPOSALS_ENABLED_KEY, - ) - .unwrap_or(false); - proposals_enabled.store(initial, Ordering::Relaxed); - } - - if agent_proposals_enabled() { - match skills::install_static_skills() { - Ok(installed) => { - info!( - path = %installed.policy_advisor.display(), - "Installed sandbox agent skill" - ); - } - Err(error) => { - warn!(error = %error, "Failed to install sandbox agent skill"); - } - } - } else { - debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); - } - // Shared PID: set after process spawn so the proxy can look up // the entrypoint process's /proc/net/tcp for identity binding. 
let entrypoint_pid = Arc::new(AtomicU32::new(0)); diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index 0b6572796..e6b59b953 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -72,6 +72,12 @@ pub async fn run_process( #[cfg(unix)] crate::process::prepare_filesystem(policy)?; + // Eagerly fetch initial settings and install the agent skill if the + // proposals flag is on at startup, rather than waiting for the policy + // poll loop's first tick. In offline/file-mode there is no gateway, so + // the flag stays at its default (false) and no skill is installed. + install_initial_agent_skill(sandbox_id, openshell_endpoint).await; + // Install the supervisor seccomp prelude before spawning any workload-side // tasks. By this point the orchestrator has finished privileged startup // helpers (network namespace setup, nftables probes via run_networking), @@ -312,3 +318,58 @@ pub async fn run_process( Ok(status.code()) } + +/// Eagerly fetch initial settings and install the agent-driven policy +/// proposal skill if the flag is on at startup. +/// +/// Without this, the skill would only get installed on the policy poll +/// loop's first false→true transition, which can be ~10 s after launch — +/// long enough for an agent to start running without seeing it. +/// +/// Best-effort: any failure (no gateway, RPC error, install failure) is +/// logged but does not fail sandbox startup. +async fn install_initial_agent_skill(sandbox_id: Option<&str>, openshell_endpoint: Option<&str>) { + use openshell_core::proto::setting_value; + use std::sync::atomic::Ordering; + + let Some(flag) = openshell_core::proposals::AGENT_PROPOSALS_ENABLED.get() else { + // The orchestrator is responsible for setting the OnceLock before + // calling run_process. If it isn't set, behave as if the flag is + // off and skip the install. 
+ tracing::debug!("AGENT_PROPOSALS_ENABLED not initialized; skipping skill install"); + return; + }; + + if let (Some(id), Some(endpoint)) = (sandbox_id, openshell_endpoint) + && let Ok(client) = + openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint).await + && let Ok(result) = client.poll_settings(id).await + { + let initial = result + .settings + .get(openshell_core::settings::AGENT_POLICY_PROPOSALS_ENABLED_KEY) + .and_then(|es| es.value.as_ref()) + .and_then(|sv| sv.value.as_ref()) + .and_then(|v| match v { + setting_value::Value::BoolValue(b) => Some(*b), + _ => None, + }) + .unwrap_or(false); + flag.store(initial, Ordering::Relaxed); + } + + if openshell_core::proposals::agent_proposals_enabled() { + match crate::skills::install_static_skills() { + Ok(installed) => info!( + path = %installed.policy_advisor.display(), + "Installed sandbox agent skill" + ), + Err(error) => tracing::warn!( + error = %error, + "Failed to install sandbox agent skill" + ), + } + } else { + tracing::debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); + } +} From d0335cfa85740fa15b71926995b9ce4ef7d02c2b Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 14:57:47 +0300 Subject: [PATCH 43/49] refactor(supervisor-network): own PolicyLocalContext construction Move the PolicyLocalContext construction from run_sandbox into run_networking. The orchestrator was building it solely to thread it into the networking leaf and to share it with the policy poll loop; now run_networking builds it from inputs it already takes (retained_proto, openshell_endpoint, sandbox_name|sandbox_id) and exposes it on the returned Networking struct. The orchestrator's poll loop now grabs the Arc clone from networking.policy_local_ctx, so the orchestrator no longer imports openshell_supervisor_network::policy_local. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 10 +--------- crates/openshell-supervisor-network/src/run.rs | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 91274840d..b9b0e7925 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -118,13 +118,6 @@ pub async fn run_sandbox( policy_data, ) .await?; - let policy_local_ctx = Arc::new( - openshell_supervisor_network::policy_local::PolicyLocalContext::new( - retained_proto.clone(), - openshell_endpoint.clone(), - sandbox_name_for_agg.clone().or_else(|| sandbox_id.clone()), - ), - ); // Fetch provider environment variables from the server. // This is done after loading the policy so the sandbox can still start @@ -215,7 +208,6 @@ pub async fn run_sandbox( retained_proto.as_ref(), entrypoint_pid.clone(), &provider_credentials, - &policy_local_ctx, sandbox_id.as_deref(), sandbox_name_for_agg.as_deref(), openshell_endpoint_for_proxy.as_deref(), @@ -235,7 +227,7 @@ pub async fn run_sandbox( let poll_ocsf_enabled = ocsf_enabled.clone(); let poll_pid = entrypoint_pid.clone(); let poll_provider_credentials = provider_credentials.clone(); - let poll_policy_local = policy_local_ctx.clone(); + let poll_policy_local = networking.policy_local_ctx.clone(); let poll_interval_secs: u64 = std::env::var("OPENSHELL_POLICY_POLL_INTERVAL_SECS") .ok() .and_then(|v| v.parse().ok()) diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index 6a6e9152e..962f975bc 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -103,6 +103,10 @@ pub struct Networking { pub ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, pub ssh_proxy_url: Option, pub ssh_netns_fd: Option, + /// Policy-local route context: shared with the orchestrator's policy 
poll + /// loop so it can publish updated `SandboxPolicy` snapshots that the + /// `policy.local` route handler returns to the workload. + pub policy_local_ctx: Arc, } /// Set up the networking stack: ephemeral CA + TLS state, proxy server, @@ -125,12 +129,22 @@ pub async fn run_networking( retained_proto: Option<&ProtoSandboxPolicy>, entrypoint_pid: Arc, provider_credentials: &ProviderCredentialState, - policy_local_ctx: &Arc, sandbox_id: Option<&str>, sandbox_name: Option<&str>, openshell_endpoint: Option<&str>, inference_routes: Option<&str>, ) -> Result { + // Build the policy-local route context. The orchestrator's policy poll + // loop also holds an `Arc` clone (via `Networking::policy_local_ctx`) so + // it can publish updated policy snapshots after a successful reload. + let policy_local_ctx = Arc::new(PolicyLocalContext::new( + retained_proto.cloned(), + openshell_endpoint.map(str::to_string), + sandbox_name + .map(str::to_string) + .or_else(|| sandbox_id.map(str::to_string)), + )); + // Spawn a task to resolve policy binary symlinks once the workload's mount // namespace becomes accessible via /proc/<pid>/root/. Reads entrypoint_pid // lazily, so spawning before run_process sets the PID is safe — the probe @@ -410,6 +424,7 @@ pub async fn run_networking( ca_file_paths, ssh_proxy_url, ssh_netns_fd, + policy_local_ctx, }) } From f525d8691bdc72c22c115ac881121a73170324aa Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 15:17:25 +0300 Subject: [PATCH 44/49] feat(supervisor): add --mode flag to gate network/process leaves Add a --mode flag (default "network,process") that selects which supervisor leaves run in the current process. Two new shapes are unlocked without splitting the binary: --mode=network # network-only sidecar --mode=process # process-only supervisor --mode=network,process # combined (default; current behavior) In network-only mode the orchestrator skips run_process and waits on SIGINT/SIGTERM before tearing down the proxy.
The entrypoint PID stays at 0 for the lifetime of the process, which silently degrades the proxy's binary-identity TOFU and the bypass monitor's PID enrichment; this is correct in a split-pod topology where the workload's /proc lives in another pod. In process-only mode run_networking is skipped entirely. SSH sessions get no proxy URL, no netns FD, and no CA paths, matching what a split-pod consumer would expect when network enforcement is delegated to a sidecar. The policy poll loop continues to run unconditionally; its OPA-reload and policy.local hooks already gate on the resources only present when network is enabled, and the env-refresh / proposals-toggle hooks remain active in process mode. Closes a step toward the RFC-0001 supervisor topology proposed in issue #1305 by drew. Signed-off-by: Radoslav Hubenov --- crates/openshell-sandbox/src/lib.rs | 146 ++++++++++++++++++++------- crates/openshell-sandbox/src/main.rs | 49 +++++++++ 2 files changed, 157 insertions(+), 38 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index b9b0e7925..101eafa3d 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -63,7 +63,11 @@ use openshell_supervisor_process::skills; /// # Errors /// /// Returns an error if the command fails to start or encounters a fatal error. -#[allow(clippy::too_many_arguments, clippy::similar_names)] +#[allow( + clippy::too_many_arguments, + clippy::similar_names, + clippy::fn_params_excessive_bools +)] pub async fn run_sandbox( command: Vec, workdir: Option, @@ -79,6 +83,8 @@ pub async fn run_sandbox( _health_port: u16, inference_routes: Option, ocsf_enabled: Arc, + network_enabled: bool, + process_enabled: bool, ) -> Result { let (program, args) = command .split_first() @@ -198,22 +204,32 @@ pub async fn run_sandbox( // it via setns(). The RAII handle lives in this frame for the duration // of the sandbox. 
#[cfg(target_os = "linux")] - let netns = openshell_supervisor_network::run::create_netns_for_proxy(&policy)?; + let netns = if network_enabled { + openshell_supervisor_network::run::create_netns_for_proxy(&policy)? + } else { + None + }; - let mut networking = openshell_supervisor_network::run::run_networking( - &policy, - #[cfg(target_os = "linux")] - netns.as_ref(), - opa_engine.as_ref(), - retained_proto.as_ref(), - entrypoint_pid.clone(), - &provider_credentials, - sandbox_id.as_deref(), - sandbox_name_for_agg.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; + let mut networking = if network_enabled { + Some( + openshell_supervisor_network::run::run_networking( + &policy, + #[cfg(target_os = "linux")] + netns.as_ref(), + opa_engine.as_ref(), + retained_proto.as_ref(), + entrypoint_pid.clone(), + &provider_credentials, + sandbox_id.as_deref(), + sandbox_name_for_agg.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?, + ) + } else { + None + }; // Spawn background policy poll task (gRPC mode only). 
if let (Some(id), Some(endpoint), Some(engine)) = ( @@ -227,7 +243,7 @@ pub async fn run_sandbox( let poll_ocsf_enabled = ocsf_enabled.clone(); let poll_pid = entrypoint_pid.clone(); let poll_provider_credentials = provider_credentials.clone(); - let poll_policy_local = networking.policy_local_ctx.clone(); + let poll_policy_local = networking.as_ref().map(|n| n.policy_local_ctx.clone()); let poll_interval_secs: u64 = std::env::var("OPENSHELL_POLICY_POLL_INTERVAL_SECS") .ok() .and_then(|v| v.parse().ok()) @@ -240,7 +256,7 @@ pub async fn run_sandbox( interval_secs: poll_interval_secs, ocsf_enabled: poll_ocsf_enabled, provider_credentials: poll_provider_credentials, - policy_local_ctx: Some(poll_policy_local), + policy_local_ctx: poll_policy_local, }; tokio::spawn(async move { @@ -257,30 +273,84 @@ pub async fn run_sandbox( }); } - let exit_code = openshell_supervisor_process::run::run_process( - program, - args, - workdir.as_deref(), - timeout_secs, - interactive, - sandbox_id.as_deref(), - openshell_endpoint.as_deref(), - ssh_socket_path, - &policy, - entrypoint_pid, - provider_credentials, - provider_env, - networking.ssh_proxy_url.take(), - networking.ssh_netns_fd, - networking.ca_file_paths.clone(), - #[cfg(target_os = "linux")] - netns.as_ref(), - ) - .await?; + let exit_code = if process_enabled { + let (ssh_proxy_url, ssh_netns_fd, ca_file_paths) = match networking.as_mut() { + Some(n) => ( + n.ssh_proxy_url.take(), + n.ssh_netns_fd, + n.ca_file_paths.clone(), + ), + None => (None, None, None), + }; + + openshell_supervisor_process::run::run_process( + program, + args, + workdir.as_deref(), + timeout_secs, + interactive, + sandbox_id.as_deref(), + openshell_endpoint.as_deref(), + ssh_socket_path, + &policy, + entrypoint_pid, + provider_credentials, + provider_env, + ssh_proxy_url, + ssh_netns_fd, + ca_file_paths, + #[cfg(target_os = "linux")] + netns.as_ref(), + ) + .await? 
+ } else { + // Network-only sidecar mode: keep the proxy and its background + // tasks alive (held via the `networking` value) until SIGINT or + // SIGTERM. Exit 0 on clean shutdown. + wait_for_shutdown_signal().await; + 0 + }; + + // Drop networking explicitly so the proxy + bypass monitor RAII + // handles tear down before we return. + drop(networking); Ok(exit_code) } +/// Wait for SIGINT or SIGTERM. Used in network-only mode where there is +/// no entrypoint child whose lifetime drives the supervisor's exit. +async fn wait_for_shutdown_signal() { + #[cfg(unix)] + { + use tokio::signal::unix::{SignalKind, signal}; + let mut sigterm = match signal(SignalKind::terminate()) { + Ok(s) => s, + Err(e) => { + tracing::warn!( + error = %e, + "Failed to install SIGTERM handler; waiting on SIGINT only" + ); + let _ = tokio::signal::ctrl_c().await; + return; + } + }; + tokio::select! { + _ = tokio::signal::ctrl_c() => { + info!("Received SIGINT, shutting down network-only supervisor"); + } + _ = sigterm.recv() => { + info!("Received SIGTERM, shutting down network-only supervisor"); + } + } + } + #[cfg(not(unix))] + { + let _ = tokio::signal::ctrl_c().await; + info!("Received Ctrl-C, shutting down network-only supervisor"); + } +} + // ============================================================================ // Baseline filesystem path enrichment // ============================================================================ diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 6e82acdba..91b145c2e 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -33,6 +33,45 @@ const COPY_SELF_SUBCOMMAND: &str = "copy-self"; /// to confirm the cross-sandbox IDOR guard fires. const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; +/// Default `--mode` value: run both supervisor leaves in a single binary. 
+const DEFAULT_MODE: &str = "network,process"; + +/// Which supervisor leaves are enabled in this process. +/// +/// Parsed from a comma-separated `--mode` value, e.g. `network`, +/// `process`, or `network,process`. At least one must be set. +#[derive(Clone, Copy, Debug)] +struct Mode { + network: bool, + process: bool, +} + +impl std::str::FromStr for Mode { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut mode = Self { + network: false, + process: false, + }; + for part in s.split(',').map(str::trim).filter(|p| !p.is_empty()) { + match part { + "network" => mode.network = true, + "process" => mode.process = true, + other => { + return Err(format!( + "unknown mode component '{other}' (expected 'network' and/or 'process')" + )); + } + } + } + if !mode.network && !mode.process { + return Err("--mode must enable at least one of: network, process".into()); + } + Ok(mode) + } +} + /// `OpenShell` Sandbox - process isolation and monitoring. #[derive(Parser, Debug)] #[command(name = "openshell-sandbox")] @@ -105,6 +144,14 @@ struct Args { /// Port for health check endpoint. #[arg(long, default_value = "8080")] health_port: u16, + + /// Which supervisor components to run. Comma-separated list of + /// "network" and/or "process". Defaults to both (single-binary + /// topology). Use --mode=network for a network-only sidecar, or + /// --mode=process for a process-only supervisor when network + /// enforcement runs in another pod. + #[arg(long, default_value = DEFAULT_MODE)] + mode: Mode, } /// Copy the running executable to `dest`, creating parent directories as @@ -308,6 +355,8 @@ fn main() -> Result<()> { args.health_port, args.inference_routes, ocsf_enabled, + args.mode.network, + args.mode.process, ) .await })?; From 76a89cf06595fcd28a5e711920282ffe1e649205 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 15:39:10 +0300 Subject: [PATCH 45/49] style(supervisor-process): rustfmt long debug! 
line Signed-off-by: Radoslav Hubenov --- crates/openshell-supervisor-process/src/run.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index e6b59b953..f3e77dc20 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -370,6 +370,8 @@ async fn install_initial_agent_skill(sandbox_id: Option<&str>, openshell_endpoin ), } } else { - tracing::debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); + tracing::debug!( + "agent_policy_proposals_enabled is false at startup; skipping skill install" + ); } } From e065e9870f82a493555c47aba3338365e7d559dd Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 16:42:17 +0300 Subject: [PATCH 46/49] refactor(supervisor-network): pull DenialEvent down from core DenialEvent is only emitted and consumed inside openshell-supervisor-network (proxy, bypass monitor, denial aggregator). It never crossed the leaf boundary, so the earlier lift to openshell-core was speculative. Move it back into the network crate where its only callers live. 
Signed-off-by: Radoslav Hubenov --- crates/openshell-core/src/lib.rs | 2 -- crates/openshell-supervisor-network/src/bypass_monitor.rs | 2 +- .../src/denial.rs | 4 +--- crates/openshell-supervisor-network/src/denial_aggregator.rs | 2 +- crates/openshell-supervisor-network/src/lib.rs | 1 + crates/openshell-supervisor-network/src/proxy.rs | 2 +- 6 files changed, 5 insertions(+), 8 deletions(-) rename crates/{openshell-core => openshell-supervisor-network}/src/denial.rs (84%) diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index b189b60c9..266c28f72 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -11,7 +11,6 @@ pub mod auth; pub mod config; -pub mod denial; pub mod driver_utils; pub mod error; pub mod forward; @@ -39,7 +38,6 @@ pub use config::{ ComputeDriverKind, Config, GatewayAuthConfig, GatewayJwtConfig, MtlsAuthConfig, OidcConfig, TlsConfig, }; -pub use denial::DenialEvent; pub use error::{ComputeDriverError, Error, Result}; pub use metadata::{GetResourceVersion, ObjectId, ObjectLabels, ObjectName, SetResourceVersion}; diff --git a/crates/openshell-supervisor-network/src/bypass_monitor.rs b/crates/openshell-supervisor-network/src/bypass_monitor.rs index aadafda9c..9147c5ba0 100644 --- a/crates/openshell-supervisor-network/src/bypass_monitor.rs +++ b/crates/openshell-supervisor-network/src/bypass_monitor.rs @@ -16,7 +16,7 @@ //! the monitor logs a one-time warning and returns. The nftables reject rules //! still provide fast-fail UX — the monitor only adds diagnostic visibility. 
-use openshell_core::DenialEvent; +use crate::denial::DenialEvent; use openshell_ocsf::{ ActionId, ActivityId, ConfidenceId, DetectionFindingBuilder, DispositionId, Endpoint, FindingInfo, NetworkActivityBuilder, Process, SeverityId, ocsf_emit, diff --git a/crates/openshell-core/src/denial.rs b/crates/openshell-supervisor-network/src/denial.rs similarity index 84% rename from crates/openshell-core/src/denial.rs rename to crates/openshell-supervisor-network/src/denial.rs index 4f610f6e3..ac94b3725 100644 --- a/crates/openshell-core/src/denial.rs +++ b/crates/openshell-supervisor-network/src/denial.rs @@ -6,9 +6,7 @@ //! `DenialEvent` is emitted by the supervisor's networking proxy (on L4/L7 //! deny) and by the bypass monitor (on direct-connect attempts that bypass //! the proxy). It is consumed by the networking-side denial aggregator that -//! deduplicates and flushes summaries to the gateway. The type lives in -//! `openshell-core` so that the eventual networking and process supervisor -//! crates can both reference it without depending on each other. +//! deduplicates and flushes summaries to the gateway. /// A single denial event emitted by the proxy or the bypass monitor. #[derive(Debug, Clone)] diff --git a/crates/openshell-supervisor-network/src/denial_aggregator.rs b/crates/openshell-supervisor-network/src/denial_aggregator.rs index c954ede30..2c8ceb4d3 100644 --- a/crates/openshell-supervisor-network/src/denial_aggregator.rs +++ b/crates/openshell-supervisor-network/src/denial_aggregator.rs @@ -14,7 +14,7 @@ use std::future::Future; use tokio::sync::mpsc; use tracing::debug; -use openshell_core::DenialEvent; +use crate::denial::DenialEvent; /// Aggregated denial summary keyed by `(host, port, binary)`. 
#[derive(Debug, Clone)] diff --git a/crates/openshell-supervisor-network/src/lib.rs b/crates/openshell-supervisor-network/src/lib.rs index 8d50e767e..141d94353 100644 --- a/crates/openshell-supervisor-network/src/lib.rs +++ b/crates/openshell-supervisor-network/src/lib.rs @@ -8,6 +8,7 @@ //! follow-up commits as modules migrate out of `openshell-sandbox`. pub mod bypass_monitor; +pub mod denial; pub mod denial_aggregator; pub mod identity; pub mod inference_routes; diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index 499386879..8d23769af 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -8,7 +8,7 @@ use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; use miette::{IntoDiagnostic, Result}; -use openshell_core::DenialEvent; +use crate::denial::DenialEvent; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; use openshell_core::policy::ProxyPolicy; use openshell_core::provider_credentials::ProviderCredentialState; From 33e00fd4582d6d2e830c72c94a85c5b4a5f4ff93 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Mon, 1 Jun 2026 16:51:43 +0300 Subject: [PATCH 47/49] refactor(supervisor-network): pull procfs down from core procfs was lifted to openshell-core under the assumption it would be shared cross-leaf, but on the current branch all three callers (bypass_monitor, identity, proxy) live in openshell-supervisor-network. No file in openshell-supervisor-process imports it. Move the module to the network crate and drop sha2/hex from openshell-core, which were pulled in only for procfs. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 2 -- crates/openshell-core/Cargo.toml | 2 -- crates/openshell-core/src/lib.rs | 1 - crates/openshell-supervisor-network/src/bypass_monitor.rs | 2 +- crates/openshell-supervisor-network/src/identity.rs | 4 ++-- crates/openshell-supervisor-network/src/lib.rs | 1 + .../src/procfs.rs | 0 crates/openshell-supervisor-network/src/proxy.rs | 8 ++++---- 8 files changed, 8 insertions(+), 12 deletions(-) rename crates/{openshell-core => openshell-supervisor-network}/src/procfs.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index bcec1f8b2..ac7fed313 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3458,7 +3458,6 @@ name = "openshell-core" version = "0.0.0" dependencies = [ "base64 0.22.1", - "hex", "ipnet", "libc", "miette", @@ -3469,7 +3468,6 @@ dependencies = [ "protobuf-src", "serde", "serde_json", - "sha2 0.10.9", "tempfile", "thiserror 2.0.18", "tokio", diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index 984ffcf06..784546f3f 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -22,8 +22,6 @@ serde_json = { workspace = true } tracing = { workspace = true } url = { workspace = true } ipnet = "2" -hex = "0.4" -sha2 = { workspace = true } base64 = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 266c28f72..9a598633f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -24,7 +24,6 @@ pub mod net; pub mod netns; pub mod paths; pub mod policy; -pub mod procfs; pub mod progress; pub mod proposals; pub mod proto; diff --git a/crates/openshell-supervisor-network/src/bypass_monitor.rs b/crates/openshell-supervisor-network/src/bypass_monitor.rs index 9147c5ba0..2852fc5e1 100644 --- a/crates/openshell-supervisor-network/src/bypass_monitor.rs +++ b/crates/openshell-supervisor-network/src/bypass_monitor.rs @@ -295,7 +295,7 @@ pub fn 
spawn( fn resolve_process_identity(entrypoint_pid: u32, src_port: u16) -> (String, String, String) { #[cfg(target_os = "linux")] { - use openshell_core::procfs; + use crate::procfs; match procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, src_port) { Ok(socket_owners) => { diff --git a/crates/openshell-supervisor-network/src/identity.rs b/crates/openshell-supervisor-network/src/identity.rs index 7f5d467d8..f687d7314 100644 --- a/crates/openshell-supervisor-network/src/identity.rs +++ b/crates/openshell-supervisor-network/src/identity.rs @@ -9,7 +9,7 @@ //! mid-sandbox and the request is denied. use miette::Result; -use openshell_core::procfs; +use crate::procfs; use std::collections::HashMap; use std::fs::Metadata; #[cfg(unix)] @@ -175,7 +175,7 @@ impl BinaryIdentityCache { #[cfg(test)] mod tests { use super::*; - use openshell_core::procfs; + use crate::procfs; use std::io::Write; use std::time::Duration; diff --git a/crates/openshell-supervisor-network/src/lib.rs b/crates/openshell-supervisor-network/src/lib.rs index 141d94353..ddb525238 100644 --- a/crates/openshell-supervisor-network/src/lib.rs +++ b/crates/openshell-supervisor-network/src/lib.rs @@ -16,5 +16,6 @@ pub mod l7; pub mod mechanistic_mapper; pub mod opa; pub mod policy_local; +pub mod procfs; pub mod proxy; pub mod run; diff --git a/crates/openshell-core/src/procfs.rs b/crates/openshell-supervisor-network/src/procfs.rs similarity index 100% rename from crates/openshell-core/src/procfs.rs rename to crates/openshell-supervisor-network/src/procfs.rs diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index 8d23769af..be08000f5 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -1178,7 +1178,7 @@ fn resolve_owner_identity( entrypoint_pid: u32, identity_cache: &BinaryIdentityCache, ) -> std::result::Result { - let bin_path = 
openshell_core::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| { + let bin_path = crate::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| { IdentityError { reason: format!("failed to resolve peer binary for PID {owner_pid}: {e}"), binary: None, @@ -1196,7 +1196,7 @@ fn resolve_owner_identity( ancestors: vec![], })?; - let ancestors = openshell_core::procfs::collect_ancestor_binaries(owner_pid, entrypoint_pid); + let ancestors = crate::procfs::collect_ancestor_binaries(owner_pid, entrypoint_pid); for ancestor in &ancestors { identity_cache @@ -1215,7 +1215,7 @@ fn resolve_owner_identity( let mut exclude = ancestors.clone(); exclude.push(bin_path.clone()); let cmdline_paths = - openshell_core::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); + crate::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); Ok(ResolvedIdentity { bin_path, @@ -1246,7 +1246,7 @@ fn resolve_process_identity( identity_cache: &BinaryIdentityCache, ) -> std::result::Result { let socket_owners = - openshell_core::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port).map_err( + crate::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port).map_err( |e| IdentityError { reason: format!("failed to resolve peer binary: {e}"), binary: None, From bbefa4555571834c942e91e2e9885387e6c7c81f Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Tue, 2 Jun 2026 09:20:46 +0300 Subject: [PATCH 48/49] style(supervisor-network): run cargo fmt Signed-off-by: Radoslav Hubenov --- .../src/identity.rs | 2 +- .../openshell-supervisor-network/src/proxy.rs | 28 ++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/crates/openshell-supervisor-network/src/identity.rs b/crates/openshell-supervisor-network/src/identity.rs index f687d7314..fce568f41 100644 --- a/crates/openshell-supervisor-network/src/identity.rs +++ b/crates/openshell-supervisor-network/src/identity.rs @@ -8,8 +8,8 @@ //! path must match the cached hash. 
A mismatch indicates the binary was replaced //! mid-sandbox and the request is denied. -use miette::Result; use crate::procfs; +use miette::Result; use std::collections::HashMap; use std::fs::Metadata; #[cfg(unix)] diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index be08000f5..c1210cb8e 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -3,12 +3,12 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. +use crate::denial::DenialEvent; use crate::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; use miette::{IntoDiagnostic, Result}; -use crate::denial::DenialEvent; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; use openshell_core::policy::ProxyPolicy; use openshell_core::provider_credentials::ProviderCredentialState; @@ -1178,14 +1178,13 @@ fn resolve_owner_identity( entrypoint_pid: u32, identity_cache: &BinaryIdentityCache, ) -> std::result::Result { - let bin_path = crate::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| { - IdentityError { + let bin_path = + crate::procfs::binary_path(owner_pid.cast_signed()).map_err(|e| IdentityError { reason: format!("failed to resolve peer binary for PID {owner_pid}: {e}"), binary: None, binary_pid: Some(owner_pid), ancestors: vec![], - } - })?; + })?; let bin_hash = identity_cache .verify_or_cache(&bin_path) @@ -1214,8 +1213,7 @@ fn resolve_owner_identity( let mut exclude = ancestors.clone(); exclude.push(bin_path.clone()); - let cmdline_paths = - crate::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); + let cmdline_paths = crate::procfs::collect_cmdline_paths(owner_pid, entrypoint_pid, &exclude); Ok(ResolvedIdentity { bin_path, @@ -1245,15 +1243,13 @@ fn 
resolve_process_identity( peer_port: u16, identity_cache: &BinaryIdentityCache, ) -> std::result::Result { - let socket_owners = - crate::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port).map_err( - |e| IdentityError { - reason: format!("failed to resolve peer binary: {e}"), - binary: None, - binary_pid: None, - ancestors: vec![], - }, - )?; + let socket_owners = crate::procfs::resolve_tcp_peer_socket_owners(entrypoint_pid, peer_port) + .map_err(|e| IdentityError { + reason: format!("failed to resolve peer binary: {e}"), + binary: None, + binary_pid: None, + ancestors: vec![], + })?; let mut identities = Vec::with_capacity(socket_owners.owners.len()); for owner in &socket_owners.owners { From 7a3fdd7bd35f21e92dd07a6ec5f6f315ad339b66 Mon Sep 17 00:00:00 2001 From: Radoslav Hubenov Date: Tue, 2 Jun 2026 11:04:04 +0300 Subject: [PATCH 49/49] fix(supervisor-network): add libc dev-dependency for procfs tests The procfs/bypass_monitor/proxy test modules use libc::{fork, exec, fcntl, kill, waitpid} but the dep wasn't declared in this crate's Cargo.toml. It was previously satisfied transitively when these modules lived in openshell-core; the move left the test target unable to resolve libc. 
Signed-off-by: Radoslav Hubenov --- Cargo.lock | 1 + crates/openshell-supervisor-network/Cargo.toml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ac7fed313..51a3e1d59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3780,6 +3780,7 @@ dependencies = [ "glob", "hex", "ipnet", + "libc", "miette", "openshell-core", "openshell-ocsf", diff --git a/crates/openshell-supervisor-network/Cargo.toml b/crates/openshell-supervisor-network/Cargo.toml index 0eca09d1a..44db67983 100644 --- a/crates/openshell-supervisor-network/Cargo.toml +++ b/crates/openshell-supervisor-network/Cargo.toml @@ -48,5 +48,8 @@ temp-env = "0.3" tokio-tungstenite = { workspace = true } futures = { workspace = true } +[target.'cfg(unix)'.dev-dependencies] +libc = "0.2" + [lints] workspace = true