diff --git a/Cargo.lock b/Cargo.lock index ad7efabc9..51a3e1d59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3457,8 +3457,12 @@ dependencies = [ name = "openshell-core" version = "0.0.0" dependencies = [ + "base64 0.22.1", "ipnet", + "libc", "miette", + "nix", + "openshell-ocsf", "prost", "prost-types", "protobuf-src", @@ -3466,9 +3470,12 @@ dependencies = [ "serde_json", "tempfile", "thiserror 2.0.18", + "tokio", "tonic", "tonic-build", + "tracing", "url", + "uuid", ] [[package]] @@ -3648,22 +3655,20 @@ dependencies = [ "hex", "hmac", "ipnet", - "landlock", - "libc", "miette", "nix", "openshell-core", "openshell-ocsf", "openshell-policy", "openshell-router", + "openshell-supervisor-network", + "openshell-supervisor-process", "rand_core 0.6.4", "rcgen", "regorus", "russh", - "rustix 1.1.4", "rustls", "rustls-pemfile", - "seccompiler", "serde", "serde_json", "serde_yml", @@ -3763,6 +3768,73 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "openshell-supervisor-network" +version = "0.0.0" +dependencies = [ + "apollo-parser", + "base64 0.22.1", + "bytes", + "flate2", + "futures", + "glob", + "hex", + "ipnet", + "libc", + "miette", + "openshell-core", + "openshell-ocsf", + "openshell-policy", + "openshell-router", + "rcgen", + "regorus", + "rustls", + "rustls-pemfile", + "serde", + "serde_json", + "serde_yml", + "sha1 0.10.6", + "sha2 0.10.9", + "temp-env", + "tempfile", + "thiserror 2.0.18", + "tokio", + "tokio-rustls", + "tokio-tungstenite 0.26.2", + "tracing", + "url", + "uuid", + "webpki-roots 1.0.7", +] + +[[package]] +name = "openshell-supervisor-process" +version = "0.0.0" +dependencies = [ + "anyhow", + "base64 0.22.1", + "hex", + "landlock", + "libc", + "miette", + "nix", + "openshell-core", + "openshell-ocsf", + "rand_core 0.6.4", + "russh", + "rustix 1.1.4", + "seccompiler", + "serde_json", + "sha2 0.10.9", + "tempfile", + "tokio", + "tokio-stream", + "tonic", + "tracing", + "tracing-subscriber", + "uuid", +] + [[package]] name = "openshell-tui" version = "0.0.0" diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index b03fb1494..784546f3f 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -13,19 +13,33 @@ repository.workspace = true [dependencies] prost = { workspace = true } prost-types = { workspace = true } -tonic = { workspace = true } +tonic = { workspace = true, features = ["channel", "tls"] } +tokio = { workspace = true } thiserror = { workspace = true } miette = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +tracing = { workspace = true } url = { workspace = true } ipnet = "2" +base64 = { workspace = true } + +[target.'cfg(target_os = "linux")'.dependencies] +openshell-ocsf = { path = "../openshell-ocsf" } +uuid = { workspace = true } +libc = "0.2" +tempfile = "3" +nix = { workspace = true } [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. ## Off by default so production builds have an empty registry. ## Enabled by e2e tests and during development. dev-settings = [] +## Expose proposals::test_helpers (`ProposalsFlagGuard`) to downstream test +## code in other crates. Enabled by openshell-sandbox and +## openshell-supervisor-network dev builds. +test-helpers = [] [build-dependencies] tonic-build = { workspace = true } diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-core/src/grpc_client.rs similarity index 99% rename from crates/openshell-sandbox/src/grpc_client.rs rename to crates/openshell-core/src/grpc_client.rs index 14a6808c1..b68137833 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-core/src/grpc_client.rs @@ -22,15 +22,15 @@ use std::collections::HashMap; use std::sync::{Arc, OnceLock, RwLock}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use miette::{IntoDiagnostic, Result, WrapErr}; -use openshell_core::proto::{ +use crate::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; -use openshell_core::sandbox_env; +use crate::sandbox_env; +use miette::{IntoDiagnostic, Result, WrapErr}; use tonic::Status; use tonic::metadata::AsciiMetadataValue; use tonic::service::interceptor::InterceptedService; @@ -674,7 +674,7 @@ pub struct SettingsPollResult { pub config_revision: u64, pub policy_source: PolicySource, /// Effective settings keyed by name. - pub settings: HashMap, + pub settings: HashMap, /// When `policy_source` is `Global`, the version of the global policy revision. pub global_policy_version: u32, pub provider_env_revision: u64, diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 17548ad1a..9a598633f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -15,14 +15,21 @@ pub mod driver_utils; pub mod error; pub mod forward; pub mod gpu; +pub mod grpc_client; pub mod image; pub mod inference; pub mod metadata; pub mod net; +#[cfg(target_os = "linux")] +pub mod netns; pub mod paths; +pub mod policy; pub mod progress; +pub mod proposals; pub mod proto; +pub mod provider_credentials; pub mod sandbox_env; +pub mod secrets; pub mod settings; pub mod time; diff --git a/crates/openshell-sandbox/src/sandbox/linux/netns.rs b/crates/openshell-core/src/netns/mod.rs similarity index 96% rename from crates/openshell-sandbox/src/sandbox/linux/netns.rs rename to crates/openshell-core/src/netns/mod.rs index 433f70b1c..bad44ac89 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/netns.rs +++ b/crates/openshell-core/src/netns/mod.rs @@ -7,6 +7,8 @@ //! the sandbox to the host. This ensures the sandboxed process can only //! communicate through the proxy running on the host side of the veth. +mod nft_ruleset; + use miette::{IntoDiagnostic, Result}; use std::net::IpAddr; use std::os::unix::io::RawFd; @@ -71,7 +73,7 @@ impl NetworkNamespace { .unwrap(); openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "creating") @@ -165,7 +167,7 @@ impl NetworkNamespace { }; openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "created") @@ -262,7 +264,7 @@ impl NetworkNamespace { pub fn install_bypass_rules(&self, proxy_port: u16) -> Result<()> { let Some(nft_path) = find_nft() else { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Disabled, "degraded") @@ -287,15 +289,12 @@ impl NetworkNamespace { // before reject rules in the chain so packets are logged before being // rejected. If the kernel lacks nft_log support, fall back to the // reject-only ruleset. - let ruleset_with_log = super::nft_ruleset::generate_bypass_ruleset( - &host_ip_str, - proxy_port, - Some(&log_prefix), - ); + let ruleset_with_log = + nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, Some(&log_prefix)); if let Err(e) = run_nft_netns(&self.name, &nft_path, &ruleset_with_log) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Low) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Other, "degraded") @@ -307,11 +306,11 @@ impl NetworkNamespace { ); let ruleset_no_log = - super::nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, None); + nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, None); if let Err(e) = run_nft_netns(&self.name, &nft_path, &ruleset_no_log) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Disabled, "failed") @@ -326,7 +325,7 @@ impl NetworkNamespace { } openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "installed") @@ -369,7 +368,7 @@ impl Drop for NetworkNamespace { } openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Disabled, "cleaned_up") diff --git a/crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs b/crates/openshell-core/src/netns/nft_ruleset.rs similarity index 100% rename from crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs rename to crates/openshell-core/src/netns/nft_ruleset.rs diff --git a/crates/openshell-core/src/paths.rs b/crates/openshell-core/src/paths.rs index 65000c6cf..9445347c7 100644 --- a/crates/openshell-core/src/paths.rs +++ b/crates/openshell-core/src/paths.rs @@ -1,12 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Centralized XDG config directory resolution and permission helpers. +//! Path utilities: XDG config directory resolution, permission helpers, and +//! lexical path normalization. //! //! All `OpenShell` crates should use [`xdg_config_dir`] from this module instead //! of reimplementing the XDG lookup. The permission helpers ensure that //! sensitive files (private keys, tokens) and the directories containing them -//! are created with restrictive modes. +//! are created with restrictive modes. [`normalize_path`] performs purely +//! lexical normalization (no filesystem access, no symlink resolution). use miette::{IntoDiagnostic, Result, WrapErr}; use std::path::{Path, PathBuf}; @@ -126,6 +128,33 @@ pub fn is_file_permissions_too_open(path: &Path) -> bool { std::fs::metadata(path).is_ok_and(|m| m.permissions().mode() & 0o077 != 0) } +/// Normalize a filesystem path by collapsing redundant separators +/// and removing trailing slashes, without requiring the path to exist on disk. +/// +/// This is a lexical normalization only — it does NOT resolve symlinks or +/// check the filesystem. `..` components are preserved verbatim; callers that +/// need to reject parent traversal must validate separately. +pub fn normalize_path(path: &str) -> String { + use std::path::Component; + + let p = Path::new(path); + let mut normalized = PathBuf::new(); + for component in p.components() { + match component { + Component::Prefix(prefix) => normalized.push(prefix.as_os_str()), + #[allow(clippy::path_buf_push_overwrite)] + Component::RootDir => normalized.push("/"), + Component::CurDir => {} // skip "." + Component::ParentDir => { + // Keep ".." — validation will catch it separately + normalized.push(".."); + } + Component::Normal(c) => normalized.push(c), + } + } + normalized.to_string_lossy().to_string() +} + #[cfg(test)] mod tests { use super::*; @@ -201,4 +230,17 @@ mod tests { std::fs::set_permissions(&file, std::fs::Permissions::from_mode(0o600)).unwrap(); assert!(!is_file_permissions_too_open(&file)); } + + #[test] + fn normalize_path_collapses_separators() { + assert_eq!(normalize_path("/usr//lib"), "/usr/lib"); + assert_eq!(normalize_path("/usr/./lib"), "/usr/lib"); + assert_eq!(normalize_path("/tmp/"), "/tmp"); + } + + #[test] + fn normalize_path_preserves_parent_dir() { + // normalize_path preserves ".." — validation catches it separately + assert_eq!(normalize_path("/usr/../etc"), "/usr/../etc"); + } } diff --git a/crates/openshell-sandbox/src/policy.rs b/crates/openshell-core/src/policy.rs similarity index 95% rename from crates/openshell-sandbox/src/policy.rs rename to crates/openshell-core/src/policy.rs index 0827fa0d0..1645b9da4 100644 --- a/crates/openshell-sandbox/src/policy.rs +++ b/crates/openshell-core/src/policy.rs @@ -3,7 +3,8 @@ //! Sandbox policy configuration. -use openshell_core::proto::{ +use crate::paths::normalize_path; +use crate::proto::{ FilesystemPolicy as ProtoFilesystemPolicy, LandlockPolicy as ProtoLandlockPolicy, ProcessPolicy as ProtoProcessPolicy, SandboxPolicy as ProtoSandboxPolicy, }; @@ -125,12 +126,12 @@ impl From for FilesystemPolicy { read_only: proto .read_only .into_iter() - .map(|p| PathBuf::from(openshell_policy::normalize_path(&p))) + .map(|p| PathBuf::from(normalize_path(&p))) .collect(), read_write: proto .read_write .into_iter() - .map(|p| PathBuf::from(openshell_policy::normalize_path(&p))) + .map(|p| PathBuf::from(normalize_path(&p))) .collect(), include_workdir: proto.include_workdir, } diff --git a/crates/openshell-core/src/proposals.rs b/crates/openshell-core/src/proposals.rs new file mode 100644 index 000000000..fcb6b110c --- /dev/null +++ b/crates/openshell-core/src/proposals.rs @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide flag controlling agent-driven policy proposals. +//! +//! Initialised once during sandbox start from the `agent_policy_proposals_enabled` +//! setting and updated by the policy poll loop when the setting changes. Read +//! by the `policy.local` route handler and by the skills installer to gate the +//! agent-controlled mutation surface. Tests use [`test_helpers::ProposalsFlagGuard`] +//! to flip the flag through a serialized guard. + +use std::sync::Arc; +use std::sync::OnceLock; +use std::sync::atomic::{AtomicBool, Ordering}; + +/// Process-wide handle to the agent-proposals flag. +/// +/// Set once by `run_sandbox()` during start; subsequent attempts to set it are +/// ignored. The contained `AtomicBool` is updated by the policy poll loop. +pub static AGENT_PROPOSALS_ENABLED: OnceLock> = OnceLock::new(); + +/// Read the current value of the agent proposals feature flag. +/// +/// Returns `false` if the flag has not been initialized (e.g. during unit +/// tests), matching the documented default for the setting. +pub fn agent_proposals_enabled() -> bool { + AGENT_PROPOSALS_ENABLED + .get() + .is_some_and(|flag| flag.load(Ordering::Relaxed)) +} + +/// Test-only helpers shared across crates' test modules. +#[cfg(any(test, feature = "test-helpers"))] +pub mod test_helpers { + use std::sync::Arc; + use std::sync::LazyLock; + use std::sync::atomic::{AtomicBool, Ordering}; + use tokio::sync::MutexGuard; + + static PROPOSALS_FLAG_LOCK: LazyLock> = + LazyLock::new(|| tokio::sync::Mutex::new(())); + + /// Guard for tests that toggle the process-wide flag. + /// + /// Acquires a process-wide async mutex, swaps in the requested value, and + /// restores the previous value on drop. Hold the guard for the duration of + /// any code that reads `agent_proposals_enabled()`. + pub struct ProposalsFlagGuard { + prev: bool, + flag: Arc, + _lock: MutexGuard<'static, ()>, + } + + impl ProposalsFlagGuard { + pub async fn set(enabled: bool) -> Self { + let lock = PROPOSALS_FLAG_LOCK.lock().await; + Self::with_lock(enabled, lock) + } + + pub fn set_blocking(enabled: bool) -> Self { + let lock = PROPOSALS_FLAG_LOCK.blocking_lock(); + Self::with_lock(enabled, lock) + } + + fn with_lock(enabled: bool, lock: MutexGuard<'static, ()>) -> Self { + let flag = super::AGENT_PROPOSALS_ENABLED + .get_or_init(|| Arc::new(AtomicBool::new(false))) + .clone(); + let prev = flag.swap(enabled, Ordering::Relaxed); + Self { + prev, + flag, + _lock: lock, + } + } + } + + impl Drop for ProposalsFlagGuard { + fn drop(&mut self) { + self.flag.store(self.prev, Ordering::Relaxed); + } + } +} diff --git a/crates/openshell-sandbox/src/provider_credentials.rs b/crates/openshell-core/src/provider_credentials.rs similarity index 100% rename from crates/openshell-sandbox/src/provider_credentials.rs rename to crates/openshell-core/src/provider_credentials.rs diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-core/src/secrets.rs similarity index 99% rename from crates/openshell-sandbox/src/secrets.rs rename to crates/openshell-core/src/secrets.rs index de7804393..42fefb944 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-core/src/secrets.rs @@ -117,13 +117,13 @@ impl fmt::Debug for SecretResolver { impl SecretResolver { #[cfg_attr(not(test), allow(dead_code))] - pub(crate) fn from_provider_env( + pub fn from_provider_env( provider_env: HashMap, ) -> (HashMap, Option) { Self::from_provider_env_for_revision(provider_env, HashMap::new(), 0) } - pub(crate) fn from_provider_env_for_revision( + pub fn from_provider_env_for_revision( provider_env: HashMap, credential_expires_at_ms: HashMap, revision: u64, @@ -136,7 +136,7 @@ impl SecretResolver { ) } - pub(crate) fn from_provider_env_for_current_revision( + pub fn from_provider_env_for_current_revision( provider_env: HashMap, credential_expires_at_ms: HashMap, revision: u64, @@ -201,7 +201,7 @@ impl SecretResolver { (child_env, Some(Self { by_placeholder })) } - pub(crate) fn merge<'a>(resolvers: impl IntoIterator) -> Option { + pub fn merge<'a>(resolvers: impl IntoIterator) -> Option { let mut by_placeholder = HashMap::new(); for resolver in resolvers { by_placeholder.extend(resolver.by_placeholder.clone()); @@ -217,7 +217,7 @@ impl SecretResolver { /// /// Returns `None` if the placeholder is unknown or the resolved value /// contains prohibited control characters (CRLF, null byte). - pub(crate) fn resolve_placeholder(&self, value: &str) -> Option<&str> { + pub fn resolve_placeholder(&self, value: &str) -> Option<&str> { let secret = if let Some(secret) = self.by_placeholder.get(value) { secret } else { @@ -245,7 +245,7 @@ impl SecretResolver { } } - pub(crate) fn rewrite_header_value( + pub fn rewrite_header_value( &self, value: &str, ) -> Result, UnresolvedPlaceholderError> { @@ -287,7 +287,7 @@ impl SecretResolver { Ok(None) } - pub(crate) fn rewrite_text_placeholders( + pub fn rewrite_text_placeholders( &self, text: &mut String, location: &'static str, @@ -352,7 +352,7 @@ impl SecretResolver { /// The message is mutated only after all placeholders resolve /// successfully. The return value is the number of replacements; callers /// must not log the rewritten text. - pub(crate) fn rewrite_websocket_text_placeholders( + pub fn rewrite_websocket_text_placeholders( &self, text: &mut String, ) -> Result { diff --git a/crates/openshell-ocsf/src/ctx.rs b/crates/openshell-ocsf/src/ctx.rs new file mode 100644 index 000000000..6916c5521 --- /dev/null +++ b/crates/openshell-ocsf/src/ctx.rs @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide [`SandboxContext`] singleton. +//! +//! Initialised once via [`set_ctx`] during sandbox start; read by every event +//! builder via [`ctx`]. Falls back to a default context when the singleton has +//! not been set (e.g. unit tests that exercise builders without booting the +//! sandbox). + +use crate::SandboxContext; +use std::sync::{LazyLock, OnceLock}; + +static OCSF_CTX: OnceLock = OnceLock::new(); + +static OCSF_CTX_FALLBACK: LazyLock = LazyLock::new(|| SandboxContext { + sandbox_id: String::new(), + sandbox_name: String::new(), + container_image: String::new(), + hostname: "test".to_string(), + product_version: env!("CARGO_PKG_VERSION").to_string(), + proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), + proxy_port: 3128, +}); + +/// Initialise the process-wide OCSF sandbox context. +/// +/// Returns `false` if the context was already set; the caller may log and +/// continue. Intended to be called exactly once during sandbox startup. +pub fn set_ctx(ctx: SandboxContext) -> bool { + OCSF_CTX.set(ctx).is_ok() +} + +/// Return a reference to the process-wide [`SandboxContext`]. +/// +/// Falls back to a default context if [`set_ctx`] has not been called (e.g. +/// during unit tests that exercise individual builders). +#[must_use] +pub fn ctx() -> &'static SandboxContext { + OCSF_CTX.get().unwrap_or(&OCSF_CTX_FALLBACK) +} diff --git a/crates/openshell-ocsf/src/lib.rs b/crates/openshell-ocsf/src/lib.rs index b9000afcf..e9d1402a6 100644 --- a/crates/openshell-ocsf/src/lib.rs +++ b/crates/openshell-ocsf/src/lib.rs @@ -25,6 +25,7 @@ pub const OCSF_VERSION: &str = "1.7.0"; pub mod builders; +pub mod ctx; pub mod enums; pub mod events; pub mod format; diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 26c8fc9d3..aaabbf926 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -858,26 +858,10 @@ fn truncate_for_display(s: &str) -> String { /// /// This is a lexical normalization only — it does NOT resolve symlinks or /// check the filesystem. -pub fn normalize_path(path: &str) -> String { - use std::path::Component; - - let p = Path::new(path); - let mut normalized = std::path::PathBuf::new(); - for component in p.components() { - match component { - Component::Prefix(prefix) => normalized.push(prefix.as_os_str()), - #[allow(clippy::path_buf_push_overwrite)] - Component::RootDir => normalized.push("/"), - Component::CurDir => {} // skip "." - Component::ParentDir => { - // Keep ".." — validation will catch it separately - normalized.push(".."); - } - Component::Normal(c) => normalized.push(c), - } - } - normalized.to_string_lossy().to_string() -} +/// +/// Re-exported from `openshell-core` so existing call sites +/// (`openshell_policy::normalize_path`) keep resolving. +pub use openshell_core::paths::normalize_path; // --------------------------------------------------------------------------- // Tests diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 6d527bc53..aacfb528a 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -19,6 +19,8 @@ openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } openshell-policy = { path = "../openshell-policy" } openshell-router = { path = "../openshell-router" } +openshell-supervisor-network = { path = "../openshell-supervisor-network" } +openshell-supervisor-process = { path = "../openshell-supervisor-process" } # Async runtime tokio = { workspace = true } @@ -79,17 +81,8 @@ tracing-appender = { workspace = true } # Unix/Process nix = { workspace = true } -[target.'cfg(unix)'.dependencies] -libc = "0.2" -rustix = { workspace = true } - -[target.'cfg(target_os = "linux")'.dependencies] -landlock = "0.4" -seccompiler = "0.5" -tempfile = "3" -uuid = { version = "1", features = ["v4"] } - [dev-dependencies] +openshell-core = { path = "../openshell-core", features = ["test-helpers"] } tempfile = "3" temp-env = "0.3" tokio-tungstenite = { workspace = true } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 4a0e61e57..101eafa3d 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -5,47 +5,16 @@ //! //! This crate provides process sandboxing and monitoring capabilities. -pub mod bypass_monitor; -mod child_env; -pub mod debug_rpc; -pub mod denial_aggregator; -mod grpc_client; -mod identity; -pub mod l7; -pub mod log_push; -pub mod mechanistic_mapper; -pub mod opa; -mod policy; -mod policy_local; -mod process; -pub mod procfs; -mod provider_credentials; -pub mod proxy; -mod sandbox; -mod secrets; -mod skills; -mod ssh; -mod supervisor_session; - -use miette::{IntoDiagnostic, Result}; -#[cfg(target_os = "linux")] -use std::collections::HashSet; +use miette::Result; use std::future::Future; -use std::net::SocketAddr; use std::sync::Arc; -use std::sync::LazyLock; -#[cfg(any(target_os = "linux", test))] -use std::sync::Mutex; -use std::sync::OnceLock; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use std::time::Duration; -use tokio::time::timeout; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, info, warn}; use openshell_ocsf::{ ActionId, ActivityId, AppLifecycleBuilder, ConfigStateChangeBuilder, DetectionFindingBuilder, - DispositionId, FindingInfo, LaunchTypeId, Process as OcsfProcess, ProcessActivityBuilder, - SandboxContext, SeverityId, StateId, StatusId, ocsf_emit, + DispositionId, FindingInfo, SandboxContext, SeverityId, StateId, StatusId, ocsf_emit, }; // --------------------------------------------------------------------------- @@ -67,29 +36,12 @@ use openshell_ocsf::{ // policy changes, or observable sandbox behavior worth structuring. // --------------------------------------------------------------------------- -/// Process-wide OCSF sandbox context. Initialized once during `run_sandbox()` -/// startup and accessible from any module in the crate via [`ocsf_ctx()`]. -static OCSF_CTX: OnceLock = OnceLock::new(); - -/// Fallback context used when `OCSF_CTX` has not been initialized (e.g. in -/// unit tests that exercise individual functions without calling `run_sandbox`). -static OCSF_CTX_FALLBACK: LazyLock = LazyLock::new(|| SandboxContext { - sandbox_id: String::new(), - sandbox_name: String::new(), - container_image: String::new(), - hostname: "test".to_string(), - product_version: openshell_core::VERSION.to_string(), - proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), - proxy_port: 3128, -}); - -/// Return a reference to the process-wide [`SandboxContext`]. +/// Re-export the process-wide OCSF sandbox context getter. /// -/// Falls back to a default context if `run_sandbox()` has not yet been called -/// (e.g. during unit tests). -pub(crate) fn ocsf_ctx() -> &'static SandboxContext { - OCSF_CTX.get().unwrap_or(&OCSF_CTX_FALLBACK) -} +/// The singleton lives in `openshell-ocsf` so both supervisor leaves can +/// reach it without depending on `openshell-sandbox`. Initialised once during +/// `run_sandbox()` startup via `openshell_ocsf::ctx::set_ctx`. +pub(crate) use openshell_ocsf::ctx::ctx as ocsf_ctx; /// Process-wide flag for the agent-driven policy proposal surface. /// Set once during `run_sandbox()` startup and updated by the settings poll @@ -98,189 +50,24 @@ pub(crate) fn ocsf_ctx() -> &'static SandboxContext { /// to gate the agent-controlled mutation surface. Exposed `pub(crate)` so /// unit tests in sibling modules can flip the flag through a serialized /// guard (see `policy_local::tests::ProposalsFlagGuard`). -pub(crate) static AGENT_PROPOSALS_ENABLED: OnceLock> = - OnceLock::new(); - -/// Read the current value of the agent proposals feature flag. -/// -/// Returns `false` if `run_sandbox()` has not initialized the flag (e.g. -/// during unit tests), matching the documented default for the setting. -pub(crate) fn agent_proposals_enabled() -> bool { - AGENT_PROPOSALS_ENABLED - .get() - .is_some_and(|flag| flag.load(Ordering::Relaxed)) -} - -/// Test-only helpers shared across sibling test modules. -#[cfg(test)] -pub(crate) mod test_helpers { - #![allow( - clippy::redundant_pub_crate, - reason = "intentional crate-private module" - )] - use std::sync::Arc; - use std::sync::LazyLock; - use std::sync::atomic::{AtomicBool, Ordering}; - use tokio::sync::MutexGuard; - - static PROPOSALS_FLAG_LOCK: LazyLock> = - LazyLock::new(|| tokio::sync::Mutex::new(())); - - /// Guard for tests that toggle the process-wide - /// `AGENT_PROPOSALS_ENABLED` flag. Acquires a process-wide async mutex, - /// swaps in the requested value, and restores the previous value on drop. - /// Hold the guard for the duration of any code that reads - /// `agent_proposals_enabled()`. - pub(crate) struct ProposalsFlagGuard { - prev: bool, - flag: Arc, - _lock: MutexGuard<'static, ()>, - } - - impl ProposalsFlagGuard { - pub(crate) async fn set(enabled: bool) -> Self { - let lock = PROPOSALS_FLAG_LOCK.lock().await; - Self::with_lock(enabled, lock) - } - - pub(crate) fn set_blocking(enabled: bool) -> Self { - let lock = PROPOSALS_FLAG_LOCK.blocking_lock(); - Self::with_lock(enabled, lock) - } - - fn with_lock(enabled: bool, lock: MutexGuard<'static, ()>) -> Self { - let flag = super::AGENT_PROPOSALS_ENABLED - .get_or_init(|| Arc::new(AtomicBool::new(false))) - .clone(); - let prev = flag.swap(enabled, Ordering::Relaxed); - Self { - prev, - flag, - _lock: lock, - } - } - } - - impl Drop for ProposalsFlagGuard { - fn drop(&mut self) { - self.flag.store(self.prev, Ordering::Relaxed); - } - } -} - -use crate::identity::BinaryIdentityCache; -use crate::l7::tls::{ - CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, - write_ca_files, -}; -use crate::opa::OpaEngine; -use crate::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; -use crate::proxy::ProxyHandle; -#[cfg(target_os = "linux")] -use crate::sandbox::linux::netns::NetworkNamespace; -pub use process::{ProcessHandle, ProcessStatus}; -pub use sandbox::apply_supervisor_startup_hardening; - -/// Default interval (seconds) for re-fetching the inference route bundle from -/// the gateway in cluster mode. Override at runtime with the -/// `OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` environment variable. -/// File-based routes (`--inference-routes`) are loaded once at startup and never -/// refreshed. -const DEFAULT_ROUTE_REFRESH_INTERVAL_SECS: u64 = 5; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum InferenceRouteSource { - File, - Cluster, - None, -} - -fn infer_route_source( - sandbox_id: Option<&str>, - openshell_endpoint: Option<&str>, - inference_routes: Option<&str>, -) -> InferenceRouteSource { - if inference_routes.is_some() { - InferenceRouteSource::File - } else if sandbox_id.is_some() && openshell_endpoint.is_some() { - InferenceRouteSource::Cluster - } else { - InferenceRouteSource::None - } -} +pub(crate) use openshell_core::proposals::AGENT_PROPOSALS_ENABLED; -fn disable_inference_on_empty_routes(source: InferenceRouteSource) -> bool { - !matches!(source, InferenceRouteSource::Cluster) -} - -fn route_refresh_interval_secs() -> u64 { - let Ok(value) = std::env::var("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS") else { - return DEFAULT_ROUTE_REFRESH_INTERVAL_SECS; - }; - match value.parse::() { - Ok(interval) if interval > 0 => interval, - Ok(_) => { - warn!( - default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, - "Ignoring zero route refresh interval" - ); - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - } - Err(error) => { - warn!( - interval = %value, - error = %error, - default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, - "Ignoring invalid route refresh interval" - ); - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - } - } -} - -#[cfg(target_os = "linux")] -static MANAGED_CHILDREN: LazyLock>> = - LazyLock::new(|| Mutex::new(HashSet::new())); - -#[cfg(target_os = "linux")] -pub(crate) fn register_managed_child(pid: u32) { - let Ok(pid) = i32::try_from(pid) else { - return; - }; - if pid <= 0 { - return; - } - if let Ok(mut children) = MANAGED_CHILDREN.lock() { - children.insert(pid); - } -} - -#[cfg(target_os = "linux")] -pub(crate) fn unregister_managed_child(pid: u32) { - let Ok(pid) = i32::try_from(pid) else { - return; - }; - if pid <= 0 { - return; - } - if let Ok(mut children) = MANAGED_CHILDREN.lock() { - children.remove(&pid); - } -} - -#[cfg(target_os = "linux")] -fn is_managed_child(pid: i32) -> bool { - MANAGED_CHILDREN - .lock() - .is_ok_and(|children| children.contains(&pid)) -} +use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; +use openshell_core::provider_credentials::ProviderCredentialState; +use openshell_supervisor_network::opa::OpaEngine; +pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; +use openshell_supervisor_process::skills; /// Run a command in the sandbox. /// /// # Errors /// /// Returns an error if the command fails to start or encounters a fatal error. -#[allow(clippy::too_many_arguments, clippy::similar_names)] +#[allow( + clippy::too_many_arguments, + clippy::similar_names, + clippy::fn_params_excessive_bools +)] pub async fn run_sandbox( command: Vec, workdir: Option, @@ -296,6 +83,8 @@ pub async fn run_sandbox( _health_port: u16, inference_routes: Option, ocsf_enabled: Arc, + network_enabled: bool, + process_enabled: bool, ) -> Result { let (program, args) = command .split_first() @@ -311,18 +100,15 @@ pub async fn run_sandbox( |s| s.trim().to_string(), ); - if OCSF_CTX - .set(SandboxContext { - sandbox_id: sandbox_id.clone().unwrap_or_default(), - sandbox_name: sandbox.as_deref().unwrap_or_default().to_string(), - container_image: std::env::var("OPENSHELL_CONTAINER_IMAGE").unwrap_or_default(), - hostname, - product_version: openshell_core::VERSION.to_string(), - proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), - proxy_port: 3128, - }) - .is_err() - { + if !openshell_ocsf::ctx::set_ctx(SandboxContext { + sandbox_id: sandbox_id.clone().unwrap_or_default(), + sandbox_name: sandbox.as_deref().unwrap_or_default().to_string(), + container_image: std::env::var("OPENSHELL_CONTAINER_IMAGE").unwrap_or_default(), + hostname, + product_version: openshell_core::VERSION.to_string(), + proxy_ip: std::net::IpAddr::from([127, 0, 0, 1]), + proxy_port: 3128, + }) { debug!("OCSF context already initialized, keeping existing"); } } @@ -338,23 +124,13 @@ pub async fn run_sandbox( policy_data, ) .await?; - let policy_local_ctx = Arc::new(policy_local::PolicyLocalContext::new( - retained_proto.clone(), - openshell_endpoint.clone(), - sandbox_name_for_agg.clone().or_else(|| sandbox_id.clone()), - )); - - // Validate that the required "sandbox" user exists in this image. - // All sandbox images must include this user for privilege dropping. - #[cfg(unix)] - validate_sandbox_user(&policy)?; // Fetch provider environment variables from the server. // This is done after loading the policy so the sandbox can still start // even if provider env fetch fails (graceful degradation). let (provider_env_revision, provider_env, provider_credential_expires_at_ms) = if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) { - match grpc_client::fetch_provider_environment(endpoint, id).await { + match openshell_core::grpc_client::fetch_provider_environment(endpoint, id).await { Ok(result) => { ocsf_emit!( ConfigStateChangeBuilder::new(ocsf_ctx()) @@ -399,31 +175,13 @@ pub async fn run_sandbox( ) }; - let provider_credentials = provider_credentials::ProviderCredentialState::from_environment( + let provider_credentials = ProviderCredentialState::from_environment( provider_env_revision, provider_env, provider_credential_expires_at_ms, ); let provider_env = provider_credentials.snapshot().child_env.clone(); - // Create identity cache for SHA256 TOFU when OPA is active - let identity_cache = opa_engine - .as_ref() - .map(|_| Arc::new(BinaryIdentityCache::new())); - - // Prepare filesystem: create and chown read_write directories - prepare_filesystem(&policy)?; - - #[cfg(target_os = "linux")] - { - let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { - process::RuntimePidLimitMode::Require - } else { - process::RuntimePidLimitMode::Warn - }; - process::check_runtime_pid_limit(pid_limit_mode)?; - } - // Initialize the agent-proposals feature flag. Default false until the // initial settings fetch (or the poll loop) tells us otherwise. The flag // gates the skill install, the policy.local route handler, and the L7 @@ -436,520 +194,56 @@ pub async fn run_sandbox( debug!("agent proposals flag already initialized, keeping existing"); } - // Eagerly fetch the initial settings so skill install can honor the flag - // at startup rather than waiting for the poll loop's first tick. In - // offline/file-mode there is no gateway, so the flag stays false. - if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) - && let Ok(client) = grpc_client::CachedOpenShellClient::connect(endpoint).await - && let Ok(result) = client.poll_settings(id).await - { - let initial = extract_bool_setting( - &result.settings, - openshell_core::settings::AGENT_POLICY_PROPOSALS_ENABLED_KEY, - ) - .unwrap_or(false); - proposals_enabled.store(initial, Ordering::Relaxed); - } - - if agent_proposals_enabled() { - match skills::install_static_skills() { - Ok(installed) => { - info!( - path = %installed.policy_advisor.display(), - "Installed sandbox agent skill" - ); - } - Err(error) => { - warn!(error = %error, "Failed to install sandbox agent skill"); - } - } - } else { - debug!("agent_policy_proposals_enabled is false at startup; skipping skill install"); - } - - // Generate ephemeral CA and TLS state for HTTPS L7 inspection. - // The CA cert is written to disk so sandbox processes can trust it. - let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { - match SandboxCa::generate() { - Ok(ca) => { - let tls_dir = std::path::Path::new("/etc/openshell-tls"); - let system_ca_bundle = read_system_ca_bundle(); - match write_ca_files(&ca, tls_dir, &system_ca_bundle) { - Ok(paths) => { - // /etc/openshell-tls is subsumed by the /etc baseline - // path injected by enrich_*_baseline_paths(), so no - // explicit Landlock entry is needed here. - - let upstream_config = build_upstream_client_config(&system_ca_bundle); - let cert_cache = CertCache::new(ca); - let state = Arc::new(ProxyTlsState::new(cert_cache, upstream_config)); - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "enabled") - .message("TLS termination enabled: ephemeral CA generated") - .build() - ); - (Some(state), Some(paths)) - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .message(format!( - "Failed to write CA files, TLS termination disabled: {e}" - )) - .build() - ); - (None, None) - } - } - } - Err(e) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .message(format!( - "Failed to generate ephemeral CA, TLS termination disabled: {e}" - )) - .build() - ); - (None, None) - } - } - } else { - (None, None) - }; - - // Create network namespace for proxy mode (Linux only) - // This must be created before the proxy AND SSH server so that SSH - // sessions can enter the namespace for network isolation. - #[cfg(target_os = "linux")] - let netns = if matches!(policy.network.mode, NetworkMode::Proxy) { - match NetworkNamespace::create() { - Ok(ns) => { - // Install bypass detection rules (nftables log + reject). - // This provides fast-fail UX and diagnostic logging for direct - // connection attempts that bypass the HTTP CONNECT proxy. - let proxy_port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - if let Err(e) = ns.install_bypass_rules(proxy_port) { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "degraded") - .message(format!( - "Failed to install bypass detection rules (non-fatal): {e}" - )) - .build() - ); - } - Some(ns) - } - Err(e) => { - return Err(miette::miette!( - "Network namespace creation failed and proxy mode requires isolation. \ - Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ - Error: {e}" - )); - } - } - } else { - None - }; - - // On non-Linux, network namespace isolation is not supported - #[cfg(not(target_os = "linux"))] - #[allow(clippy::no_effect_underscore_binding)] - let _netns: Option<()> = None; - - // Install the supervisor seccomp prelude after privileged startup helpers - // (network namespace setup, nftables probes) complete, but before the SSH - // listener and workload process are exposed. - apply_supervisor_startup_hardening()?; - // Shared PID: set after process spawn so the proxy can look up // the entrypoint process's /proc/net/tcp for identity binding. let entrypoint_pid = Arc::new(AtomicU32::new(0)); - let (_proxy, denial_rx, bypass_denial_tx) = if matches!(policy.network.mode, NetworkMode::Proxy) - { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!("Network mode is set to proxy but no proxy configuration was provided") - })?; - - let engine = opa_engine.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - })?; - - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an identity cache (OPA engine must be configured)") - })?; - - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { - let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); - - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; - - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = build_inference_context( - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; - - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. - let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; - - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - ) - .await?; - (Some(proxy_handle), denial_rx, bypass_denial_tx) + // Create the workload's network namespace. It is shared infrastructure: + // the proxy binds to its host-side veth IP, the bypass monitor reads + // /dev/kmsg from inside it, and the workload child / SSH sessions enter + // it via setns(). The RAII handle lives in this frame for the duration + // of the sandbox. + #[cfg(target_os = "linux")] + let netns = if network_enabled { + openshell_supervisor_network::run::create_netns_for_proxy(&policy)? } else { - (None, None, None) + None }; - // Spawn bypass detection monitor (Linux only, proxy mode only). - // Reads /dev/kmsg for nftables log entries and emits structured - // tracing events for direct connection attempts that bypass the proxy. - #[cfg(target_os = "linux")] - let _bypass_monitor = netns.as_ref().and_then(|ns| { - bypass_monitor::spawn( - ns.name().to_string(), - entrypoint_pid.clone(), - bypass_denial_tx, + let mut networking = if network_enabled { + Some( + openshell_supervisor_network::run::run_networking( + &policy, + #[cfg(target_os = "linux")] + netns.as_ref(), + opa_engine.as_ref(), + retained_proto.as_ref(), + entrypoint_pid.clone(), + &provider_credentials, + sandbox_id.as_deref(), + sandbox_name_for_agg.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?, ) - }); - - // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). - #[cfg(not(target_os = "linux"))] - drop(bypass_denial_tx); - - // Compute the proxy URL and netns fd for SSH sessions. - // SSH shell processes need both to enforce network policy: - // - netns_fd: enter the network namespace via setns() so all traffic - // goes through the veth pair (hard enforcement, non-bypassable) - // - proxy_url: set proxy env vars so cooperative tools route through the - // CONNECT proxy; this also opts Node.js into honoring those vars - #[cfg(target_os = "linux")] - let ssh_netns_fd = netns.as_ref().and_then(NetworkNamespace::ns_fd); - - #[cfg(not(target_os = "linux"))] - let ssh_netns_fd: Option = None; - - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { - #[cfg(target_os = "linux")] - { - netns.as_ref().map(|ns| { - let port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) - } - #[cfg(not(target_os = "linux"))] - { - policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map(|addr| format!("http://{addr}")) - } } else { None }; - // Zombie reaper — openshell-sandbox may run as PID 1 in containers and - // must reap orphaned grandchildren (e.g. background daemons started by - // coding agents) to prevent zombie accumulation. - // - // Use waitid(..., WNOWAIT) so we can inspect exited children before - // actually reaping them. This avoids racing explicit `child.wait()` calls - // for managed children (entrypoint and SSH session processes). - #[cfg(target_os = "linux")] - tokio::spawn(async { - use nix::sys::wait::{Id, WaitPidFlag, WaitStatus, waitid, waitpid}; - use tokio::signal::unix::{SignalKind, signal}; - use tokio::time::MissedTickBehavior; - - let mut sigchld = match signal(SignalKind::child()) { - Ok(s) => s, - Err(e) => { - tracing::warn!(error = %e, "Failed to register SIGCHLD handler for zombie reaping"); - return; - } - }; - let mut retry = tokio::time::interval(Duration::from_secs(5)); - retry.set_missed_tick_behavior(MissedTickBehavior::Skip); - - loop { - tokio::select! { - _ = sigchld.recv() => {} - _ = retry.tick() => {} - } - - loop { - let status = match waitid( - Id::All, - WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT, - ) { - Ok(WaitStatus::StillAlive) | Err(nix::errno::Errno::ECHILD) => break, - Ok(status) => status, - Err(nix::errno::Errno::EINTR) => continue, - Err(e) => { - tracing::debug!(error = %e, "waitid error during zombie reaping"); - break; - } - }; - - let Some(pid) = status.pid() else { - break; - }; - - if is_managed_child(pid.as_raw()) { - // Let the explicit waiter own this child status. - break; - } - - match waitpid(pid, Some(WaitPidFlag::WNOHANG)) { - Ok(WaitStatus::StillAlive) - | Err(nix::errno::Errno::ECHILD | nix::errno::Errno::EINTR) => {} - Ok(reaped) => { - tracing::debug!(?reaped, "Reaped orphaned child process"); - } - Err(e) => { - tracing::debug!(error = %e, "waitpid error during orphan reap"); - break; - } - } - } - } - }); - - let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); - if let Some(listen_path) = ssh_socket_path.clone() { - let policy_clone = policy.clone(); - let workdir_clone = workdir.clone(); - let proxy_url = ssh_proxy_url; - let netns_fd = ssh_netns_fd; - let ca_paths = ca_file_paths.clone(); - let provider_credentials_clone = provider_credentials.clone(); - - let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); - - tokio::spawn(async move { - if let Err(err) = ssh::run_ssh_server( - listen_path, - ssh_ready_tx, - policy_clone, - workdir_clone, - netns_fd, - proxy_url, - ca_paths, - provider_credentials_clone, - ) - .await - { - ocsf_emit!( - AppLifecycleBuilder::new(ocsf_ctx()) - .activity(ActivityId::Fail) - .severity(SeverityId::Critical) - .status(StatusId::Failure) - .message(format!("SSH server failed: {err}")) - .build() - ); - } - }); - - // Wait for the SSH server to bind its socket before spawning the - // entrypoint process. This prevents exec requests from racing against - // SSH server startup when Kubernetes marks the pod Ready. - match timeout(Duration::from_secs(10), ssh_ready_rx).await { - Ok(Ok(Ok(()))) => { - ocsf_emit!( - AppLifecycleBuilder::new(ocsf_ctx()) - .activity(ActivityId::Open) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .message("SSH server is ready to accept connections") - .build() - ); - } - Ok(Ok(Err(err))) => { - return Err(err.context("SSH server failed during startup")); - } - Ok(Err(_)) => { - return Err(miette::miette!( - "SSH server task panicked before signaling ready" - )); - } - Err(_) => { - return Err(miette::miette!( - "SSH server did not start within 10 seconds" - )); - } - } - } - - // Spawn the persistent supervisor session if we have a gateway endpoint - // and sandbox identity. The session provides relay channels for SSH - // connect and ExecSandbox through the gateway. - if let (Some(endpoint), Some(id), Some(socket)) = ( - openshell_endpoint.as_ref(), - sandbox_id.as_ref(), - ssh_socket_path.as_ref(), - ) { - supervisor_session::spawn(endpoint.clone(), id.clone(), socket.clone(), ssh_netns_fd); - info!("supervisor session task spawned"); - } - - #[cfg(target_os = "linux")] - let mut handle = ProcessHandle::spawn( - program, - args, - workdir.as_deref(), - interactive, - &policy, - netns.as_ref(), - ca_file_paths.as_ref(), - &provider_env, - )?; - - #[cfg(not(target_os = "linux"))] - let mut handle = ProcessHandle::spawn( - program, - args, - workdir.as_deref(), - interactive, - &policy, - ca_file_paths.as_ref(), - &provider_env, - )?; - - // Store the entrypoint PID so the proxy can resolve TCP peer identity - entrypoint_pid.store(handle.pid(), Ordering::Release); - ocsf_emit!( - ProcessActivityBuilder::new(ocsf_ctx()) - .activity(ActivityId::Open) - .action(ActionId::Allowed) - .disposition(DispositionId::Allowed) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .launch_type(LaunchTypeId::Spawn) - .process(OcsfProcess::new(program, i64::from(handle.pid()))) - .message(format!("Process started: pid={}", handle.pid())) - .build() - ); - - // Spawn a task to resolve policy binary symlinks after the container - // filesystem becomes accessible via /proc//root/. This expands - // symlinks like /usr/bin/python3 → /usr/bin/python3.11 in the OPA - // policy data so that either path matches at evaluation time. - // - // We cannot do this synchronously here because the child process has - // just been spawned and its mount namespace / procfs entries may not - // be fully populated yet. Instead, we probe with retries until - // /proc//root/ is accessible or we exhaust attempts. - if let (Some(engine), Some(proto)) = (&opa_engine, &retained_proto) { - let resolve_engine = engine.clone(); - let resolve_proto = proto.clone(); - let resolve_pid = entrypoint_pid.clone(); - tokio::spawn(async move { - let pid = resolve_pid.load(Ordering::Acquire); - let probe_path = format!("/proc/{pid}/root/"); - // Retry up to 10 times with 500ms intervals (5s total). - // The child's mount namespace is typically ready within a - // few hundred ms of spawn. - for attempt in 1..=10 { - tokio::time::sleep(Duration::from_millis(500)).await; - if std::fs::metadata(&probe_path).is_ok() { - info!( - pid = pid, - attempt = attempt, - "Container filesystem accessible, resolving policy binary symlinks" - ); - match resolve_engine.reload_from_proto_with_pid(&resolve_proto, pid) { - Ok(()) => { - info!( - pid = pid, - "Policy binary symlink resolution complete \ - (check logs above for per-binary results)" - ); - } - Err(e) => { - warn!( - "Failed to rebuild OPA engine with symlink resolution \ - (non-fatal, falling back to literal path matching): {e}" - ); - } - } - return; - } - debug!( - pid = pid, - attempt = attempt, - probe_path = %probe_path, - "Container filesystem not yet accessible, retrying symlink resolution" - ); - } - warn!( - "Container filesystem /proc/{pid}/root/ not accessible after 10 attempts (5s); \ - binary symlink resolution skipped. Policy binary paths will be matched literally. \ - If binaries are symlinks, use canonical paths in your policy \ - (run 'readlink -f ' inside the sandbox)" - ); - }); - } - // Spawn background policy poll task (gRPC mode only). - if let (Some(id), Some(endpoint), Some(engine)) = - (&sandbox_id, &openshell_endpoint, &opa_engine) - { - let poll_id = id.clone(); - let poll_endpoint = endpoint.clone(); + if let (Some(id), Some(endpoint), Some(engine)) = ( + sandbox_id.as_deref(), + openshell_endpoint.as_deref(), + opa_engine.as_ref(), + ) { + let poll_id = id.to_string(); + let poll_endpoint = endpoint.to_string(); let poll_engine = engine.clone(); let poll_ocsf_enabled = ocsf_enabled.clone(); let poll_pid = entrypoint_pid.clone(); let poll_provider_credentials = provider_credentials.clone(); - let poll_policy_local = policy_local_ctx.clone(); + let poll_policy_local = networking.as_ref().map(|n| n.policy_local_ctx.clone()); let poll_interval_secs: u64 = std::env::var("OPENSHELL_POLICY_POLL_INTERVAL_SECS") .ok() .and_then(|v| v.parse().ok()) @@ -962,7 +256,7 @@ pub async fn run_sandbox( interval_secs: poll_interval_secs, ocsf_enabled: poll_ocsf_enabled, provider_credentials: poll_provider_credentials, - policy_local_ctx: Some(poll_policy_local), + policy_local_ctx: poll_policy_local, }; tokio::spawn(async move { @@ -971,392 +265,90 @@ pub async fn run_sandbox( AppLifecycleBuilder::new(ocsf_ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) - .status(StatusId::Failure) - .message(format!("Policy poll loop exited with error: {e}")) - .build() - ); - } - }); - - // Spawn denial aggregator (gRPC mode only, when proxy is active). - if let Some(rx) = denial_rx { - // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID. - let agg_name = sandbox_name_for_agg.clone().unwrap_or_else(|| id.clone()); - let agg_endpoint = endpoint.clone(); - let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(10); - - let aggregator = denial_aggregator::DenialAggregator::new(rx, flush_interval_secs); - - tokio::spawn(async move { - aggregator - .run(|summaries| { - let endpoint = agg_endpoint.clone(); - let sandbox_name = agg_name.clone(); - async move { - if let Err(e) = - flush_proposals_to_gateway(&endpoint, &sandbox_name, summaries) - .await - { - warn!(error = %e, "Failed to flush denial summaries to gateway"); - } - } - }) - .await; - }); - } - } - - // Wait for process with optional timeout - let result = if timeout_secs > 0 { - if let Ok(result) = timeout(Duration::from_secs(timeout_secs), handle.wait()).await { - result - } else { - ocsf_emit!( - ProcessActivityBuilder::new(ocsf_ctx()) - .activity(ActivityId::Close) - .action(ActionId::Denied) - .disposition(DispositionId::Blocked) - .severity(SeverityId::Critical) - .status(StatusId::Failure) - .message("Process timed out, killing") - .build() - ); - handle.kill()?; - return Ok(124); // Standard timeout exit code - } - } else { - handle.wait().await - }; - - let status = result.into_diagnostic()?; - - ocsf_emit!( - ProcessActivityBuilder::new(ocsf_ctx()) - .activity(ActivityId::Close) - .action(ActionId::Allowed) - .disposition(DispositionId::Allowed) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .exit_code(status.code()) - .message(format!("Process exited with code {}", status.code())) - .build() - ); - - Ok(status.code()) -} - -/// Build an inference context for local routing, if route sources are available. -/// -/// Route sources (in priority order): -/// 1. Inference routes file (standalone mode) — always takes precedence -/// 2. Cluster bundle (fetched from gateway via gRPC) -/// -/// If both a routes file and cluster credentials are provided, the routes file -/// wins and the cluster bundle is not fetched. -/// -/// Returns `None` if neither source is configured (inference routing disabled). -// `routes`/`router` are intentionally distinct nouns (the route list vs the -// router that consumes them); both names are clearer than alternatives. -#[allow(clippy::similar_names)] -async fn build_inference_context( - sandbox_id: Option<&str>, - openshell_endpoint: Option<&str>, - inference_routes: Option<&str>, -) -> Result>> { - use openshell_router::Router; - use openshell_router::config::RouterConfig; - - let source = infer_route_source(sandbox_id, openshell_endpoint, inference_routes); - - // Captured during the initial cluster bundle fetch so the background refresh - // loop can skip no-op updates from the very first tick. - let mut initial_revision: Option = None; - - let routes = match source { - InferenceRouteSource::File => { - let Some(path) = inference_routes else { - return Ok(None); - }; - - // Standalone mode: load routes from file (fail-fast on errors) - if sandbox_id.is_some() { - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "loaded") - .unmapped("inference_routes", serde_json::json!(path)) - .message(format!( - "Inference routes file takes precedence over cluster bundle [path:{path}]" - )) - .build()); - } - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Other, "loading") - .unmapped("inference_routes", serde_json::json!(path)) - .message(format!("Loading inference routes from file [path:{path}]")) - .build() - ); - let config = RouterConfig::load_from_file(std::path::Path::new(path)) - .map_err(|e| miette::miette!("failed to load inference routes {path}: {e}"))?; - config - .resolve_routes() - .map_err(|e| miette::miette!("failed to resolve routes from {path}: {e}"))? - } - InferenceRouteSource::Cluster => { - let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) else { - return Ok(None); - }; - - // Cluster mode: fetch bundle from gateway - info!(endpoint = %endpoint, "Fetching inference route bundle from gateway"); - match grpc_client::fetch_inference_bundle(endpoint).await { - Ok(bundle) => { - initial_revision = Some(bundle.revision.clone()); - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "loaded") - .unmapped("route_count", serde_json::json!(bundle.routes.len())) - .unmapped("revision", serde_json::json!(&bundle.revision)) - .message(format!( - "Loaded inference route bundle [route_count:{} revision:{}]", - bundle.routes.len(), - bundle.revision - )) - .build() - ); - bundle_to_resolved_routes(&bundle) - } - Err(e) => { - // Distinguish expected "not configured" states from server errors. - // gRPC PermissionDenied/NotFound means inference bundle is unavailable - // for this sandbox — skip gracefully. Other errors are unexpected. - let msg = e.to_string(); - if msg.contains("permission denied") || msg.contains("not found") { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Disabled, "disabled") - .unmapped("error", serde_json::json!(e.to_string())) - .message(format!( - "Inference bundle unavailable, routing disabled [error:{e}]" - )) - .build() - ); - return Ok(None); - } - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Disabled, "disabled") - .unmapped("error", serde_json::json!(e.to_string())) - .message(format!( - "Failed to fetch inference bundle, inference routing disabled [error:{e}]" - )) - .build()); - return Ok(None); - } - } - } - InferenceRouteSource::None => { - // No route source — inference routing is not configured - return Ok(None); - } - }; - - if routes.is_empty() && disable_inference_on_empty_routes(source) { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Disabled, "disabled") - .message("No usable inference routes, inference routing disabled") - .build() - ); - return Ok(None); - } - - if routes.is_empty() { - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Other, "waiting") - .message("Inference route bundle is empty; keeping routing enabled and waiting for refresh") - .build()); - } - - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "enabled") - .unmapped("route_count", serde_json::json!(routes.len())) - .message(format!( - "Inference routing enabled with local execution [route_count:{}]", - routes.len() - )) - .build() - ); - - // Partition routes by name into user-facing and system caches. - let (user_routes, system_routes) = partition_routes(routes); - - let router = - Router::new().map_err(|e| miette::miette!("failed to initialize inference router: {e}"))?; - let patterns = l7::inference::default_patterns(); - - let ctx = Arc::new(proxy::InferenceContext::new( - patterns, - router, - user_routes, - system_routes, - )); - - // Spawn background route cache refresh for cluster mode at startup so - // request handling never depends on control-plane latency. - if matches!(source, InferenceRouteSource::Cluster) - && let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) - { - spawn_route_refresh( - ctx.route_cache(), - ctx.system_route_cache(), - endpoint.to_string(), - route_refresh_interval_secs(), - initial_revision, - ); - } - - Ok(Some(ctx)) -} - -/// Route name for the sandbox system inference route. -const SANDBOX_SYSTEM_ROUTE_NAME: &str = "sandbox-system"; - -/// Split resolved routes into user-facing and system caches by route name. -/// -/// Routes named `"sandbox-system"` go to the system cache; everything else -/// (including `"inference.local"` and empty names) goes to the user cache. -fn partition_routes( - routes: Vec, -) -> ( - Vec, - Vec, -) { - let mut user = Vec::new(); - let mut system = Vec::new(); - for r in routes { - if r.name == SANDBOX_SYSTEM_ROUTE_NAME { - system.push(r); - } else { - user.push(r); - } - } - (user, system) -} - -/// Convert a proto bundle response into resolved routes for the router. -pub(crate) fn bundle_to_resolved_routes( - bundle: &openshell_core::proto::GetInferenceBundleResponse, -) -> Vec { - bundle - .routes - .iter() - .map(|r| { - let (auth, default_headers, passthrough_headers) = - openshell_core::inference::route_headers_for_provider_type(&r.provider_type); - let timeout = if r.timeout_secs == 0 { - openshell_router::config::DEFAULT_ROUTE_TIMEOUT - } else { - Duration::from_secs(r.timeout_secs) - }; - openshell_router::config::ResolvedRoute { - name: r.name.clone(), - endpoint: r.base_url.clone(), - model: r.model_id.clone(), - api_key: r.api_key.clone(), - protocols: r.protocols.clone(), - auth, - default_headers, - passthrough_headers, - timeout, + .status(StatusId::Failure) + .message(format!("Policy poll loop exited with error: {e}")) + .build() + ); } - }) - .collect() -} - -/// Spawn a background task that periodically refreshes both route caches from the gateway. -/// -/// The loop uses the bundle `revision` hash to avoid unnecessary cache writes -/// when routes haven't changed. `initial_revision` is the revision captured -/// during the startup fetch in [`build_inference_context`] so the first refresh -/// cycle can already skip a no-op update. -pub(crate) fn spawn_route_refresh( - user_cache: Arc>>, - system_cache: Arc>>, - endpoint: String, - interval_secs: u64, - initial_revision: Option, -) { - tokio::spawn(async move { - use tokio::time::{MissedTickBehavior, interval}; + }); + } - let mut current_revision = initial_revision; + let exit_code = if process_enabled { + let (ssh_proxy_url, ssh_netns_fd, ca_file_paths) = match networking.as_mut() { + Some(n) => ( + n.ssh_proxy_url.take(), + n.ssh_netns_fd, + n.ca_file_paths.clone(), + ), + None => (None, None, None), + }; - let mut tick = interval(Duration::from_secs(interval_secs)); - tick.set_missed_tick_behavior(MissedTickBehavior::Skip); + openshell_supervisor_process::run::run_process( + program, + args, + workdir.as_deref(), + timeout_secs, + interactive, + sandbox_id.as_deref(), + openshell_endpoint.as_deref(), + ssh_socket_path, + &policy, + entrypoint_pid, + provider_credentials, + provider_env, + ssh_proxy_url, + ssh_netns_fd, + ca_file_paths, + #[cfg(target_os = "linux")] + netns.as_ref(), + ) + .await? + } else { + // Network-only sidecar mode: keep the proxy and its background + // tasks alive (held via the `networking` value) until SIGINT or + // SIGTERM. Exit 0 on clean shutdown. + wait_for_shutdown_signal().await; + 0 + }; - loop { - tick.tick().await; + // Drop networking explicitly so the proxy + bypass monitor RAII + // handles tear down before we return. + drop(networking); - match grpc_client::fetch_inference_bundle(&endpoint).await { - Ok(bundle) => { - if current_revision.as_deref() == Some(&bundle.revision) { - trace!(revision = %bundle.revision, "Inference bundle unchanged"); - continue; - } + Ok(exit_code) +} - let routes = bundle_to_resolved_routes(&bundle); - let (user_routes, system_routes) = partition_routes(routes); - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "updated") - .unmapped("user_route_count", serde_json::json!(user_routes.len())) - .unmapped("system_route_count", serde_json::json!(system_routes.len())) - .unmapped("revision", serde_json::json!(&bundle.revision)) - .message(format!( - "Inference routes updated [user_route_count:{} system_route_count:{} revision:{}]", - user_routes.len(), - system_routes.len(), - bundle.revision - )) - .build()); - current_revision = Some(bundle.revision); - *user_cache.write().await = user_routes; - *system_cache.write().await = system_routes; - } - Err(e) => { - ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Medium) - .status(StatusId::Failure) - .state(StateId::Other, "stale") - .unmapped("error", serde_json::json!(e.to_string())) - .message(format!( - "Failed to refresh inference route cache, keeping stale routes [error:{e}]" - )) - .build()); - } +/// Wait for SIGINT or SIGTERM. Used in network-only mode where there is +/// no entrypoint child whose lifetime drives the supervisor's exit. +async fn wait_for_shutdown_signal() { + #[cfg(unix)] + { + use tokio::signal::unix::{SignalKind, signal}; + let mut sigterm = match signal(SignalKind::terminate()) { + Ok(s) => s, + Err(e) => { + tracing::warn!( + error = %e, + "Failed to install SIGTERM handler; waiting on SIGINT only" + ); + let _ = tokio::signal::ctrl_c().await; + return; + } + }; + tokio::select! { + _ = tokio::signal::ctrl_c() => { + info!("Received SIGINT, shutting down network-only supervisor"); + } + _ = sigterm.recv() => { + info!("Received SIGTERM, shutting down network-only supervisor"); } } - }); + } + #[cfg(not(unix))] + { + let _ = tokio::signal::ctrl_c().await; + info!("Received Ctrl-C, shutting down network-only supervisor"); + } } // ============================================================================ @@ -1668,7 +660,7 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { )] mod baseline_tests { use super::*; - use crate::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; + use openshell_core::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; #[test] fn proc_not_in_both_read_only_and_read_write_when_gpu_present() { @@ -1985,8 +977,10 @@ async fn load_policy( endpoint = %endpoint, "Fetching sandbox policy via gRPC" ); - let proto_policy = - grpc_retry("Policy fetch", || grpc_client::fetch_policy(endpoint, id)).await?; + let proto_policy = grpc_retry("Policy fetch", || { + openshell_core::grpc_client::fetch_policy(endpoint, id) + }) + .await?; let mut proto_policy = if let Some(p) = proto_policy { p @@ -2016,7 +1010,12 @@ async fn load_policy( // Sync and re-fetch over a single connection to avoid extra // TLS handshakes. grpc_retry("Policy discovery sync", || { - grpc_client::discover_and_sync_policy(endpoint, id, sandbox, &discovered) + openshell_core::grpc_client::discover_and_sync_policy( + endpoint, + id, + sandbox, + &discovered, + ) }) .await? }; @@ -2027,7 +1026,9 @@ async fn load_policy( let enriched = enrich_proto_baseline_paths(&mut proto_policy); if enriched && let Some(sandbox_name) = sandbox.as_deref() - && let Err(e) = grpc_client::sync_policy(endpoint, sandbox_name, &proto_policy).await + && let Err(e) = + openshell_core::grpc_client::sync_policy(endpoint, sandbox_name, &proto_policy) + .await { warn!( error = %e, @@ -2164,211 +1165,9 @@ fn discover_policy_from_path(path: &std::path::Path) -> openshell_core::proto::S } } -/// Validate that the `sandbox` user exists in this image. -/// -/// All sandbox images must include a `sandbox` user for privilege dropping. -/// This check runs at supervisor startup (inside the container) where we can -/// inspect `/etc/passwd`. If the user is missing, the sandbox fails fast -/// with a clear error instead of silently running child processes as root. -#[cfg(unix)] -fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { - use nix::unistd::User; - - let user_name = policy.process.run_as_user.as_deref().unwrap_or("sandbox"); - - if user_name.is_empty() || user_name == "sandbox" { - match User::from_name("sandbox") { - Ok(Some(_)) => { - ocsf_emit!( - ConfigStateChangeBuilder::new(ocsf_ctx()) - .severity(SeverityId::Informational) - .status(StatusId::Success) - .state(StateId::Enabled, "validated") - .message("Validated 'sandbox' user exists in image") - .build() - ); - } - Ok(None) => { - return Err(miette::miette!( - "sandbox user 'sandbox' not found in image; \ - all sandbox images must include a 'sandbox' user and group" - )); - } - Err(e) => { - return Err(miette::miette!("failed to look up 'sandbox' user: {e}")); - } - } - } - - Ok(()) -} - -/// Prepare a `read_write` path for the sandboxed process. -/// -/// Returns `true` when the path was created by the supervisor and therefore -/// still needs to be chowned to the sandbox user/group. Existing paths keep -/// their image-defined ownership. -#[cfg(unix)] -fn prepare_read_write_path(path: &std::path::Path) -> Result { - // SECURITY: use symlink_metadata (lstat) to inspect each path *before* - // calling chown. chown follows symlinks, so a malicious container image - // could place a symlink (e.g. /sandbox -> /etc/shadow) to trick the - // root supervisor into transferring ownership of arbitrary files. - // The TOCTOU window between lstat and chown is not exploitable because - // no untrusted process is running yet (the child has not been forked). - if let Ok(meta) = std::fs::symlink_metadata(path) { - if meta.file_type().is_symlink() { - return Err(miette::miette!( - "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", - path.display() - )); - } - - debug!( - path = %path.display(), - "Preserving ownership for existing read_write path" - ); - Ok(false) - } else { - debug!(path = %path.display(), "Creating read_write directory"); - std::fs::create_dir_all(path).into_diagnostic()?; - Ok(true) - } -} - -/// Prepare filesystem for the sandboxed process. -/// -/// Creates `read_write` directories if they don't exist and sets ownership -/// on newly-created paths to the configured sandbox user/group. This runs as -/// the supervisor (root) before forking the child process. -#[cfg(unix)] -fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { - use nix::unistd::{Group, User, chown}; - - let user_name = match policy.process.run_as_user.as_deref() { - Some(name) if !name.is_empty() => Some(name), - _ => None, - }; - let group_name = match policy.process.run_as_group.as_deref() { - Some(name) if !name.is_empty() => Some(name), - _ => None, - }; - - // If no user/group configured, nothing to do - if user_name.is_none() && group_name.is_none() { - return Ok(()); - } - - // Resolve user and group - let uid = if let Some(name) = user_name { - Some( - User::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? - .uid, - ) - } else { - None - }; - - let gid = if let Some(name) = group_name { - Some( - Group::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? - .gid, - ) - } else { - None - }; - - // Create missing read_write paths and only chown the ones we created. - for path in &policy.filesystem.read_write { - if prepare_read_write_path(path)? { - debug!( - path = %path.display(), - ?uid, - ?gid, - "Setting ownership on newly created read_write path" - ); - chown(path, uid, gid).into_diagnostic()?; - } - } - - Ok(()) -} - -#[cfg(not(unix))] -fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { - Ok(()) -} - /// Background loop that polls the server for policy updates. /// /// When a new version is detected, attempts to reload the OPA engine via -/// Flush aggregated denial summaries to the gateway via `SubmitPolicyAnalysis`. -async fn flush_proposals_to_gateway( - endpoint: &str, - sandbox_name: &str, - summaries: Vec, -) -> Result<()> { - use crate::grpc_client::CachedOpenShellClient; - use openshell_core::proto::{DenialSummary, L7RequestSample}; - - let client = CachedOpenShellClient::connect(endpoint).await?; - - // Convert FlushableDenialSummary to proto DenialSummary. - let proto_summaries: Vec = summaries - .into_iter() - .map(|s| DenialSummary { - sandbox_id: String::new(), - host: s.host, - port: u32::from(s.port), - binary: s.binary, - ancestors: s.ancestors, - deny_reason: s.deny_reason, - first_seen_ms: s.first_seen_ms, - last_seen_ms: s.last_seen_ms, - count: s.count, - suppressed_count: 0, - total_count: s.count, - sample_cmdlines: s.sample_cmdlines, - binary_sha256: String::new(), - persistent: false, - denial_stage: s.denial_stage, - l7_request_samples: s - .l7_samples - .into_iter() - .map(|l| L7RequestSample { - method: l.method, - path: l.path, - decision: "deny".to_string(), - count: l.count, - }) - .collect(), - l7_inspection_active: false, - }) - .collect(); - - // Run the mechanistic mapper sandbox-side to generate proposals. - // The gateway is a thin persistence + validation layer — it never - // generates proposals itself. - let proposals = mechanistic_mapper::generate_proposals(&proto_summaries); - - info!( - sandbox_name = %sandbox_name, - summaries = proto_summaries.len(), - proposals = proposals.len(), - "Flushed denial analysis to gateway" - ); - - client - .submit_policy_analysis(sandbox_name, proto_summaries, proposals, "mechanistic") - .await?; - - Ok(()) -} - /// `reload_from_proto_with_pid()`. Reports load success/failure back to the /// server. On failure, the previous engine is untouched (LKG behavior). /// @@ -2381,12 +1180,12 @@ struct PolicyPollLoopContext { entrypoint_pid: Arc, interval_secs: u64, ocsf_enabled: Arc, - provider_credentials: provider_credentials::ProviderCredentialState, - policy_local_ctx: Option>, + provider_credentials: ProviderCredentialState, + policy_local_ctx: Option>, } async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { - use crate::grpc_client::CachedOpenShellClient; + use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::PolicySource; use std::sync::atomic::Ordering; @@ -2452,7 +1251,12 @@ async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { .build()); if provider_env_changed { - match grpc_client::fetch_provider_environment(&ctx.endpoint, &ctx.sandbox_id).await { + match openshell_core::grpc_client::fetch_provider_environment( + &ctx.endpoint, + &ctx.sandbox_id, + ) + .await + { Ok(env_result) => { let env_count = ctx.provider_credentials.install_environment( env_result.provider_env_revision, @@ -2712,267 +1516,6 @@ fn format_setting_value(es: &openshell_core::proto::EffectiveSetting) -> String )] mod tests { use super::*; - use crate::policy::{FilesystemPolicy, LandlockPolicy, ProcessPolicy}; - #[cfg(unix)] - use nix::unistd::{Group, User}; - #[cfg(unix)] - use std::os::unix::fs::{MetadataExt, symlink}; - use temp_env::with_vars; - - static ENV_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); - - #[test] - fn bundle_to_resolved_routes_converts_all_fields() { - let bundle = openshell_core::proto::GetInferenceBundleResponse { - routes: vec![ - openshell_core::proto::ResolvedRoute { - name: "frontier".to_string(), - base_url: "https://api.example.com/v1".to_string(), - api_key: "sk-test-key".to_string(), - model_id: "gpt-4".to_string(), - protocols: vec![ - "openai_chat_completions".to_string(), - "openai_responses".to_string(), - ], - provider_type: "openai".to_string(), - timeout_secs: 0, - }, - openshell_core::proto::ResolvedRoute { - name: "local".to_string(), - base_url: "http://vllm:8000/v1".to_string(), - api_key: "local-key".to_string(), - model_id: "llama-3".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - provider_type: String::new(), - timeout_secs: 120, - }, - ], - revision: "abc123".to_string(), - generated_at_ms: 1000, - }; - - let routes = bundle_to_resolved_routes(&bundle); - - assert_eq!(routes.len(), 2); - assert_eq!(routes[0].endpoint, "https://api.example.com/v1"); - assert_eq!(routes[0].model, "gpt-4"); - assert_eq!(routes[0].api_key, "sk-test-key"); - assert_eq!( - routes[0].auth, - openshell_core::inference::AuthHeader::Bearer - ); - assert_eq!( - routes[0].protocols, - vec!["openai_chat_completions", "openai_responses"] - ); - assert_eq!( - routes[0].timeout, - openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - "timeout_secs=0 should map to default" - ); - assert_eq!(routes[1].endpoint, "http://vllm:8000/v1"); - assert_eq!( - routes[1].auth, - openshell_core::inference::AuthHeader::Bearer - ); - assert_eq!( - routes[1].timeout, - Duration::from_secs(120), - "timeout_secs=120 should map to 120s" - ); - } - - #[test] - fn bundle_to_resolved_routes_handles_empty_bundle() { - let bundle = openshell_core::proto::GetInferenceBundleResponse { - routes: vec![], - revision: "empty".to_string(), - generated_at_ms: 0, - }; - - let routes = bundle_to_resolved_routes(&bundle); - assert!(routes.is_empty()); - } - - #[test] - fn bundle_to_resolved_routes_preserves_name_field() { - let bundle = openshell_core::proto::GetInferenceBundleResponse { - routes: vec![openshell_core::proto::ResolvedRoute { - name: "sandbox-system".to_string(), - base_url: "https://api.example.com/v1".to_string(), - api_key: "key".to_string(), - model_id: "model".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - provider_type: "openai".to_string(), - timeout_secs: 0, - }], - revision: "rev".to_string(), - generated_at_ms: 0, - }; - - let routes = bundle_to_resolved_routes(&bundle); - assert_eq!(routes[0].name, "sandbox-system"); - } - - #[test] - fn routes_segregated_by_name() { - let routes = vec![ - openshell_router::config::ResolvedRoute { - name: "inference.local".to_string(), - endpoint: "https://api.openai.com/v1".to_string(), - model: "gpt-4o".to_string(), - api_key: "key1".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - auth: openshell_core::inference::AuthHeader::Bearer, - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - }, - openshell_router::config::ResolvedRoute { - name: "sandbox-system".to_string(), - endpoint: "https://api.anthropic.com/v1".to_string(), - model: "claude-sonnet-4-20250514".to_string(), - api_key: "key2".to_string(), - protocols: vec!["anthropic_messages".to_string()], - auth: openshell_core::inference::AuthHeader::Custom("x-api-key"), - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - }, - ]; - - let (user, system) = partition_routes(routes); - assert_eq!(user.len(), 1); - assert_eq!(user[0].name, "inference.local"); - assert_eq!(system.len(), 1); - assert_eq!(system[0].name, "sandbox-system"); - } - - // -- build_inference_context tests -- - - #[tokio::test] - async fn build_inference_context_route_file_loads_routes() { - use std::io::Write; - - let yaml = r#" -routes: - - name: inference.local - endpoint: http://localhost:8000/v1 - model: llama-3 - protocols: [openai_chat_completions] - api_key: test-key -"#; - let mut f = tempfile::NamedTempFile::new().unwrap(); - f.write_all(yaml.as_bytes()).unwrap(); - let path = f.path().to_str().unwrap(); - - let ctx = build_inference_context(None, None, Some(path)) - .await - .expect("should load routes from file"); - - let ctx = ctx.expect("context should be Some"); - let cache = ctx.route_cache(); - let routes = cache.read().await; - assert_eq!(routes.len(), 1); - assert_eq!(routes[0].endpoint, "http://localhost:8000/v1"); - } - - #[tokio::test] - async fn build_inference_context_empty_route_file_returns_none() { - use std::io::Write; - - // Route file with empty routes list → inference routing disabled (not an error) - let yaml = "routes: []\n"; - let mut f = tempfile::NamedTempFile::new().unwrap(); - f.write_all(yaml.as_bytes()).unwrap(); - let path = f.path().to_str().unwrap(); - - let ctx = build_inference_context(None, None, Some(path)) - .await - .expect("empty routes file should not error"); - assert!( - ctx.is_none(), - "empty routes should disable inference routing" - ); - } - - #[tokio::test] - async fn build_inference_context_no_sources_returns_none() { - let ctx = build_inference_context(None, None, None) - .await - .expect("should succeed with None"); - - assert!(ctx.is_none(), "no sources should return None"); - } - - #[tokio::test] - async fn build_inference_context_route_file_overrides_cluster() { - use std::io::Write; - - let yaml = r#" -routes: - - name: inference.local - endpoint: http://localhost:9999/v1 - model: file-model - protocols: [openai_chat_completions] - api_key: file-key -"#; - let mut f = tempfile::NamedTempFile::new().unwrap(); - f.write_all(yaml.as_bytes()).unwrap(); - let path = f.path().to_str().unwrap(); - - // Even with sandbox_id and endpoint, route_file takes precedence - let ctx = build_inference_context(Some("sb-1"), Some("http://localhost:50051"), Some(path)) - .await - .expect("should load from file"); - - let ctx = ctx.expect("context should be Some"); - let cache = ctx.route_cache(); - let routes = cache.read().await; - assert_eq!(routes[0].endpoint, "http://localhost:9999/v1"); - } - - #[test] - fn infer_route_source_prefers_file_mode() { - assert_eq!( - infer_route_source( - Some("sb-1"), - Some("http://localhost:50051"), - Some("routes.yaml") - ), - InferenceRouteSource::File - ); - } - - #[test] - fn infer_route_source_cluster_requires_id_and_endpoint() { - assert_eq!( - infer_route_source(Some("sb-1"), Some("http://localhost:50051"), None), - InferenceRouteSource::Cluster - ); - assert_eq!( - infer_route_source(Some("sb-1"), None, None), - InferenceRouteSource::None - ); - assert_eq!( - infer_route_source(None, Some("http://localhost:50051"), None), - InferenceRouteSource::None - ); - } - - #[test] - fn disable_inference_on_empty_routes_depends_on_source() { - assert!(disable_inference_on_empty_routes( - InferenceRouteSource::File - )); - assert!(!disable_inference_on_empty_routes( - InferenceRouteSource::Cluster - )); - assert!(disable_inference_on_empty_routes( - InferenceRouteSource::None - )); - } - // ---- Policy disk discovery tests ---- #[test] @@ -3066,173 +1609,4 @@ filesystem_policy: let local_policy = SandboxPolicy::try_from(proto).expect("conversion should succeed"); assert!(matches!(local_policy.network.mode, NetworkMode::Proxy)); } - - // ---- Route refresh interval + revision tests ---- - - #[test] - fn default_route_refresh_interval_is_five_seconds() { - assert_eq!(DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, 5); - } - - #[test] - fn route_refresh_interval_uses_env_override() { - let _guard = ENV_LOCK.lock().unwrap(); - with_vars( - [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("9"))], - || { - assert_eq!(route_refresh_interval_secs(), 9); - }, - ); - } - - #[test] - fn route_refresh_interval_rejects_zero() { - let _guard = ENV_LOCK.lock().unwrap(); - with_vars( - [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("0"))], - || { - assert_eq!( - route_refresh_interval_secs(), - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - ); - }, - ); - } - - #[test] - fn route_refresh_interval_rejects_invalid_values() { - let _guard = ENV_LOCK.lock().unwrap(); - with_vars( - [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("abc"))], - || { - assert_eq!( - route_refresh_interval_secs(), - DEFAULT_ROUTE_REFRESH_INTERVAL_SECS - ); - }, - ); - } - - #[tokio::test] - async fn route_cache_preserves_content_when_not_written() { - use std::sync::Arc; - use tokio::sync::RwLock; - - let routes = vec![openshell_router::config::ResolvedRoute { - name: "inference.local".to_string(), - endpoint: "http://original:8000/v1".to_string(), - model: "original-model".to_string(), - api_key: "key".to_string(), - auth: openshell_core::inference::AuthHeader::Bearer, - protocols: vec!["openai_chat_completions".to_string()], - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - }]; - - let cache = Arc::new(RwLock::new(routes)); - - // Verify the cache preserves its content — the revision-based skip - // logic in spawn_route_refresh ensures the cache is only written - // when the revision actually changes. - let read = cache.read().await; - assert_eq!(read.len(), 1); - assert_eq!(read[0].model, "original-model"); - } - - #[cfg(unix)] - fn sandbox_policy_with_read_write( - path: std::path::PathBuf, - run_as_user: Option, - run_as_group: Option, - ) -> SandboxPolicy { - SandboxPolicy { - version: 1, - filesystem: FilesystemPolicy { - read_only: vec![], - read_write: vec![path], - include_workdir: false, - }, - network: NetworkPolicy::default(), - landlock: LandlockPolicy::default(), - process: ProcessPolicy { - run_as_user, - run_as_group, - }, - } - } - - #[cfg(unix)] - #[test] - fn prepare_read_write_path_creates_missing_directory() { - let dir = tempfile::tempdir().unwrap(); - let missing = dir.path().join("missing").join("nested"); - - assert!(prepare_read_write_path(&missing).unwrap()); - assert!(missing.is_dir()); - } - - #[cfg(unix)] - #[test] - fn prepare_read_write_path_preserves_existing_directory() { - let dir = tempfile::tempdir().unwrap(); - let existing = dir.path().join("existing"); - std::fs::create_dir(&existing).unwrap(); - - assert!(!prepare_read_write_path(&existing).unwrap()); - assert!(existing.is_dir()); - } - - #[cfg(unix)] - #[test] - fn prepare_read_write_path_rejects_symlink() { - let dir = tempfile::tempdir().unwrap(); - let target = dir.path().join("target"); - let link = dir.path().join("link"); - std::fs::create_dir(&target).unwrap(); - symlink(&target, &link).unwrap(); - - let error = prepare_read_write_path(&link).unwrap_err(); - assert!( - error - .to_string() - .contains("is a symlink — refusing to chown"), - "unexpected error: {error}" - ); - } - - #[cfg(unix)] - #[test] - fn prepare_filesystem_skips_chown_for_existing_read_write_paths() { - if nix::unistd::geteuid().is_root() { - return; - } - - let current_user = User::from_uid(nix::unistd::geteuid()) - .unwrap() - .expect("current user entry"); - let restricted_group = Group::from_gid(nix::unistd::Gid::from_raw(0)) - .unwrap() - .expect("gid 0 group entry"); - if restricted_group.gid == nix::unistd::getegid() { - return; - } - - let dir = tempfile::tempdir().unwrap(); - let existing = dir.path().join("existing"); - std::fs::create_dir(&existing).unwrap(); - let before = std::fs::metadata(&existing).unwrap(); - - let policy = sandbox_policy_with_read_write( - existing.clone(), - Some(current_user.name), - Some(restricted_group.name), - ); - - prepare_filesystem(&policy).expect("existing path should not be re-owned"); - - let after = std::fs::metadata(&existing).unwrap(); - assert_eq!(after.uid(), before.uid()); - assert_eq!(after.gid(), before.gid()); - } } diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 3c9e21578..91b145c2e 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -33,6 +33,45 @@ const COPY_SELF_SUBCOMMAND: &str = "copy-self"; /// to confirm the cross-sandbox IDOR guard fires. const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; +/// Default `--mode` value: run both supervisor leaves in a single binary. +const DEFAULT_MODE: &str = "network,process"; + +/// Which supervisor leaves are enabled in this process. +/// +/// Parsed from a comma-separated `--mode` value, e.g. `network`, +/// `process`, or `network,process`. At least one must be set. +#[derive(Clone, Copy, Debug)] +struct Mode { + network: bool, + process: bool, +} + +impl std::str::FromStr for Mode { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut mode = Self { + network: false, + process: false, + }; + for part in s.split(',').map(str::trim).filter(|p| !p.is_empty()) { + match part { + "network" => mode.network = true, + "process" => mode.process = true, + other => { + return Err(format!( + "unknown mode component '{other}' (expected 'network' and/or 'process')" + )); + } + } + } + if !mode.network && !mode.process { + return Err("--mode must enable at least one of: network, process".into()); + } + Ok(mode) + } +} + /// `OpenShell` Sandbox - process isolation and monitoring. #[derive(Parser, Debug)] #[command(name = "openshell-sandbox")] @@ -105,6 +144,14 @@ struct Args { /// Port for health check endpoint. #[arg(long, default_value = "8080")] health_port: u16, + + /// Which supervisor components to run. Comma-separated list of + /// "network" and/or "process". Defaults to both (single-binary + /// topology). Use --mode=network for a network-only sidecar, or + /// --mode=process for a process-only supervisor when network + /// enforcement runs in another pod. + #[arg(long, default_value = DEFAULT_MODE)] + mode: Mode, } /// Copy the running executable to `dest`, creating parent directories as @@ -168,7 +215,7 @@ fn main() -> Result<()> { .into_diagnostic()?; return runtime.block_on(async move { let _ = rustls::crypto::ring::default_provider().install_default(); - let exit = openshell_sandbox::debug_rpc::run(&raw_args[2..]).await?; + let exit = openshell_supervisor_process::debug_rpc::run(&raw_args[2..]).await?; std::process::exit(exit); }); } @@ -206,11 +253,12 @@ fn main() -> Result<()> { let log_push_state = if let (Some(sandbox_id), Some(endpoint)) = (&args.sandbox_id, &args.openshell_endpoint) { - let (tx, handle) = openshell_sandbox::log_push::spawn_log_push_task( + let (tx, handle) = openshell_supervisor_process::log_push::spawn_log_push_task( endpoint.clone(), sandbox_id.clone(), ); - let layer = openshell_sandbox::log_push::LogPushLayer::new(sandbox_id.clone(), tx); + let layer = + openshell_supervisor_process::log_push::LogPushLayer::new(sandbox_id.clone(), tx); Some((layer, handle)) } else { None @@ -307,6 +355,8 @@ fn main() -> Result<()> { args.health_port, args.inference_routes, ocsf_enabled, + args.mode.network, + args.mode.process, ) .await })?; diff --git a/crates/openshell-supervisor-network/Cargo.toml b/crates/openshell-supervisor-network/Cargo.toml new file mode 100644 index 000000000..44db67983 --- /dev/null +++ b/crates/openshell-supervisor-network/Cargo.toml @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-supervisor-network" +description = "Network component of the OpenShell supervisor: proxy, L7 enforcement, OPA, inference routing, denial aggregator" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true + +[dependencies] +openshell-core = { path = "../openshell-core" } +openshell-ocsf = { path = "../openshell-ocsf" } +openshell-policy = { path = "../openshell-policy" } +openshell-router = { path = "../openshell-router" } + +apollo-parser = { workspace = true } +base64 = { workspace = true } +bytes = { workspace = true } +flate2 = "1" +glob = { workspace = true } +hex = "0.4" +ipnet = "2" +miette = { workspace = true } +rcgen = { workspace = true } +regorus = { version = "0.9", default-features = false, features = ["std", "arc", "glob", "yaml"] } +rustls = { workspace = true } +rustls-pemfile = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +serde_yml = { workspace = true } +sha1 = "0.10" +sha2 = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tokio-rustls = { workspace = true } +tracing = { workspace = true } +url = { workspace = true } +uuid = { workspace = true } +webpki-roots = { workspace = true } + +[dev-dependencies] +openshell-core = { path = "../openshell-core", features = ["test-helpers"] } +tempfile = "3" +temp-env = "0.3" +tokio-tungstenite = { workspace = true } +futures = { workspace = true } + +[target.'cfg(unix)'.dev-dependencies] +libc = "0.2" + +[lints] +workspace = true diff --git a/crates/openshell-sandbox/data/sandbox-policy.rego b/crates/openshell-supervisor-network/data/sandbox-policy.rego similarity index 100% rename from crates/openshell-sandbox/data/sandbox-policy.rego rename to crates/openshell-supervisor-network/data/sandbox-policy.rego diff --git a/crates/openshell-sandbox/src/bypass_monitor.rs b/crates/openshell-supervisor-network/src/bypass_monitor.rs similarity index 98% rename from crates/openshell-sandbox/src/bypass_monitor.rs rename to crates/openshell-supervisor-network/src/bypass_monitor.rs index 9e37ef27c..2852fc5e1 100644 --- a/crates/openshell-sandbox/src/bypass_monitor.rs +++ b/crates/openshell-supervisor-network/src/bypass_monitor.rs @@ -16,7 +16,7 @@ //! the monitor logs a one-time warning and returns. The nftables reject rules //! still provide fast-fail UX — the monitor only adds diagnostic visibility. -use crate::denial_aggregator::DenialEvent; +use crate::denial::DenialEvent; use openshell_ocsf::{ ActionId, ActivityId, ConfidenceId, DetectionFindingBuilder, DispositionId, Endpoint, FindingInfo, NetworkActivityBuilder, Process, SeverityId, ocsf_emit, @@ -130,7 +130,7 @@ pub fn spawn( .status(); if !dmesg_check.is_ok_and(|s| s.success()) { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .message( @@ -158,7 +158,7 @@ pub fn spawn( { Ok(c) => c, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .message(format!( @@ -171,7 +171,7 @@ pub fn spawn( }; let Some(stdout) = child.stdout.take() else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .message("dmesg --follow produced no stdout; bypass monitor will not run") @@ -214,7 +214,7 @@ pub fn spawn( Endpoint::from_domain(&event.dst_addr, event.dst_port) }; - let net_event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let net_event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -230,7 +230,7 @@ pub fn spawn( .build(); ocsf_emit!(net_event); - let finding_event = DetectionFindingBuilder::new(crate::ocsf_ctx()) + let finding_event = DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) diff --git a/crates/openshell-supervisor-network/src/denial.rs b/crates/openshell-supervisor-network/src/denial.rs new file mode 100644 index 000000000..ac94b3725 --- /dev/null +++ b/crates/openshell-supervisor-network/src/denial.rs @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-component denial event type. +//! +//! `DenialEvent` is emitted by the supervisor's networking proxy (on L4/L7 +//! deny) and by the bypass monitor (on direct-connect attempts that bypass +//! the proxy). It is consumed by the networking-side denial aggregator that +//! deduplicates and flushes summaries to the gateway. + +/// A single denial event emitted by the proxy or the bypass monitor. +#[derive(Debug, Clone)] +pub struct DenialEvent { + /// Destination host that was denied. + pub host: String, + /// Destination port that was denied. + pub port: u16, + /// Binary path that initiated the connection (if resolved). + pub binary: String, + /// Ancestor binary paths from process tree walk. + pub ancestors: Vec, + /// Reason for denial (e.g. "no matching policy", "internal address"). + pub deny_reason: String, + /// Denial stage: "connect", "forward", "ssrf", "l7", "bypass". + pub denial_stage: String, + /// L7 request method (if this is an L7 denial). + pub l7_method: Option, + /// L7 target path. + pub l7_path: Option, +} diff --git a/crates/openshell-sandbox/src/denial_aggregator.rs b/crates/openshell-supervisor-network/src/denial_aggregator.rs similarity index 89% rename from crates/openshell-sandbox/src/denial_aggregator.rs rename to crates/openshell-supervisor-network/src/denial_aggregator.rs index 5d41adffd..2c8ceb4d3 100644 --- a/crates/openshell-sandbox/src/denial_aggregator.rs +++ b/crates/openshell-supervisor-network/src/denial_aggregator.rs @@ -14,26 +14,7 @@ use std::future::Future; use tokio::sync::mpsc; use tracing::debug; -/// A single denial event emitted by the proxy. -#[derive(Debug, Clone)] -pub struct DenialEvent { - /// Destination host that was denied. - pub host: String, - /// Destination port that was denied. - pub port: u16, - /// Binary path that initiated the connection (if resolved). - pub binary: String, - /// Ancestor binary paths from process tree walk. - pub ancestors: Vec, - /// Reason for denial (e.g. "no matching policy", "internal address"). - pub deny_reason: String, - /// Denial stage: "connect", "forward", "ssrf", "l7", "bypass". - pub denial_stage: String, - /// L7 request details (method, path, decision) if this is an L7 denial. - pub l7_method: Option, - /// L7 target path. - pub l7_path: Option, -} +use crate::denial::DenialEvent; /// Aggregated denial summary keyed by `(host, port, binary)`. #[derive(Debug, Clone)] diff --git a/crates/openshell-sandbox/src/identity.rs b/crates/openshell-supervisor-network/src/identity.rs similarity index 99% rename from crates/openshell-sandbox/src/identity.rs rename to crates/openshell-supervisor-network/src/identity.rs index 49809f95b..fce568f41 100644 --- a/crates/openshell-sandbox/src/identity.rs +++ b/crates/openshell-supervisor-network/src/identity.rs @@ -79,6 +79,12 @@ pub struct BinaryIdentityCache { hashes: Mutex>, } +impl Default for BinaryIdentityCache { + fn default() -> Self { + Self::new() + } +} + impl BinaryIdentityCache { pub fn new() -> Self { Self { diff --git a/crates/openshell-supervisor-network/src/inference_routes.rs b/crates/openshell-supervisor-network/src/inference_routes.rs new file mode 100644 index 000000000..c9f9983ee --- /dev/null +++ b/crates/openshell-supervisor-network/src/inference_routes.rs @@ -0,0 +1,737 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Inference route bundle resolution and refresh. +//! +//! Resolves inference routes from one of two sources at sandbox startup: +//! a local YAML file (`--inference-routes`) or a cluster bundle fetched via +//! gRPC. Builds the [`InferenceContext`] consumed by the proxy's L7 layer +//! and spawns a background refresh loop in cluster mode so route changes +//! propagate without restarting the sandbox. +//! +//! Distinct from [`crate::l7::inference`], which parses HTTP requests and +//! matches them against API patterns at request time. +//! +//! [`InferenceContext`]: crate::proxy::InferenceContext + +use std::sync::Arc; +use std::time::Duration; + +use miette::Result; +use tracing::{info, trace, warn}; + +use openshell_ocsf::{ + ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ctx::ctx as ocsf_ctx, ocsf_emit, +}; + +/// Default interval (seconds) for re-fetching the inference route bundle from +/// the gateway in cluster mode. +/// +/// Override at runtime with the `OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` +/// environment variable. File-based routes (`--inference-routes`) are loaded +/// once at startup and never refreshed. +pub const DEFAULT_ROUTE_REFRESH_INTERVAL_SECS: u64 = 5; + +/// Route name for the sandbox system inference route. +const SANDBOX_SYSTEM_ROUTE_NAME: &str = "sandbox-system"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum InferenceRouteSource { + File, + Cluster, + None, +} + +pub fn infer_route_source( + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> InferenceRouteSource { + if inference_routes.is_some() { + InferenceRouteSource::File + } else if sandbox_id.is_some() && openshell_endpoint.is_some() { + InferenceRouteSource::Cluster + } else { + InferenceRouteSource::None + } +} + +pub fn disable_inference_on_empty_routes(source: InferenceRouteSource) -> bool { + !matches!(source, InferenceRouteSource::Cluster) +} + +pub fn route_refresh_interval_secs() -> u64 { + let Ok(value) = std::env::var("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS") else { + return DEFAULT_ROUTE_REFRESH_INTERVAL_SECS; + }; + match value.parse::() { + Ok(interval) if interval > 0 => interval, + Ok(_) => { + warn!( + default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, + "Ignoring zero route refresh interval" + ); + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + } + Err(error) => { + warn!( + interval = %value, + error = %error, + default_interval_secs = DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, + "Ignoring invalid route refresh interval" + ); + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + } + } +} + +/// Build an [`InferenceContext`](crate::proxy::InferenceContext) by resolving +/// inference routes from either a local YAML file or the gateway bundle. +/// +/// If both a routes file and cluster credentials are provided, the routes file +/// wins and the cluster bundle is not fetched. +/// +/// Returns `None` if neither source is configured (inference routing disabled). +/// +/// # Errors +/// +/// Returns an error if loading the routes file fails or the file's routes +/// cannot be resolved. gRPC errors are swallowed (logged) and produce +/// `Ok(None)` so a missing cluster bundle disables inference routing rather +/// than aborting sandbox startup. +// `routes`/`router` are intentionally distinct nouns (the route list vs the +// router that consumes them); both names are clearer than alternatives. +#[allow(clippy::similar_names)] +pub async fn build_inference_context( + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> Result>> { + use openshell_router::Router; + use openshell_router::config::RouterConfig; + + let source = infer_route_source(sandbox_id, openshell_endpoint, inference_routes); + + // Captured during the initial cluster bundle fetch so the background refresh + // loop can skip no-op updates from the very first tick. + let mut initial_revision: Option = None; + + let routes = match source { + InferenceRouteSource::File => { + let Some(path) = inference_routes else { + return Ok(None); + }; + + // Standalone mode: load routes from file (fail-fast on errors) + if sandbox_id.is_some() { + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "loaded") + .unmapped("inference_routes", serde_json::json!(path)) + .message(format!( + "Inference routes file takes precedence over cluster bundle [path:{path}]" + )) + .build()); + } + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Other, "loading") + .unmapped("inference_routes", serde_json::json!(path)) + .message(format!("Loading inference routes from file [path:{path}]")) + .build() + ); + let config = RouterConfig::load_from_file(std::path::Path::new(path)) + .map_err(|e| miette::miette!("failed to load inference routes {path}: {e}"))?; + config + .resolve_routes() + .map_err(|e| miette::miette!("failed to resolve routes from {path}: {e}"))? + } + InferenceRouteSource::Cluster => { + let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) else { + return Ok(None); + }; + + // Cluster mode: fetch bundle from gateway + info!(endpoint = %endpoint, "Fetching inference route bundle from gateway"); + match openshell_core::grpc_client::fetch_inference_bundle(endpoint).await { + Ok(bundle) => { + initial_revision = Some(bundle.revision.clone()); + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "loaded") + .unmapped("route_count", serde_json::json!(bundle.routes.len())) + .unmapped("revision", serde_json::json!(&bundle.revision)) + .message(format!( + "Loaded inference route bundle [route_count:{} revision:{}]", + bundle.routes.len(), + bundle.revision + )) + .build() + ); + bundle_to_resolved_routes(&bundle) + } + Err(e) => { + // Distinguish expected "not configured" states from server errors. + // gRPC PermissionDenied/NotFound means inference bundle is unavailable + // for this sandbox — skip gracefully. Other errors are unexpected. + let msg = e.to_string(); + if msg.contains("permission denied") || msg.contains("not found") { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Disabled, "disabled") + .unmapped("error", serde_json::json!(e.to_string())) + .message(format!( + "Inference bundle unavailable, routing disabled [error:{e}]" + )) + .build() + ); + return Ok(None); + } + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .unmapped("error", serde_json::json!(e.to_string())) + .message(format!( + "Failed to fetch inference bundle, inference routing disabled [error:{e}]" + )) + .build()); + return Ok(None); + } + } + } + InferenceRouteSource::None => { + // No route source — inference routing is not configured + return Ok(None); + } + }; + + if routes.is_empty() && disable_inference_on_empty_routes(source) { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Disabled, "disabled") + .message("No usable inference routes, inference routing disabled") + .build() + ); + return Ok(None); + } + + if routes.is_empty() { + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Other, "waiting") + .message("Inference route bundle is empty; keeping routing enabled and waiting for refresh") + .build()); + } + + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "enabled") + .unmapped("route_count", serde_json::json!(routes.len())) + .message(format!( + "Inference routing enabled with local execution [route_count:{}]", + routes.len() + )) + .build() + ); + + // Partition routes by name into user-facing and system caches. + let (user_routes, system_routes) = partition_routes(routes); + + let router = + Router::new().map_err(|e| miette::miette!("failed to initialize inference router: {e}"))?; + let patterns = crate::l7::inference::default_patterns(); + + let ctx = Arc::new(crate::proxy::InferenceContext::new( + patterns, + router, + user_routes, + system_routes, + )); + + // Spawn background route cache refresh for cluster mode at startup so + // request handling never depends on control-plane latency. + if matches!(source, InferenceRouteSource::Cluster) + && let (Some(_id), Some(endpoint)) = (sandbox_id, openshell_endpoint) + { + spawn_route_refresh( + ctx.route_cache(), + ctx.system_route_cache(), + endpoint.to_string(), + route_refresh_interval_secs(), + initial_revision, + ); + } + + Ok(Some(ctx)) +} + +/// Split resolved routes into user-facing and system caches by route name. +/// +/// Routes named `"sandbox-system"` go to the system cache; everything else +/// (including `"inference.local"` and empty names) goes to the user cache. +pub fn partition_routes( + routes: Vec, +) -> ( + Vec, + Vec, +) { + let mut user = Vec::new(); + let mut system = Vec::new(); + for r in routes { + if r.name == SANDBOX_SYSTEM_ROUTE_NAME { + system.push(r); + } else { + user.push(r); + } + } + (user, system) +} + +/// Convert a proto bundle response into resolved routes for the router. +pub fn bundle_to_resolved_routes( + bundle: &openshell_core::proto::GetInferenceBundleResponse, +) -> Vec { + bundle + .routes + .iter() + .map(|r| { + let (auth, default_headers, passthrough_headers) = + openshell_core::inference::route_headers_for_provider_type(&r.provider_type); + let timeout = if r.timeout_secs == 0 { + openshell_router::config::DEFAULT_ROUTE_TIMEOUT + } else { + Duration::from_secs(r.timeout_secs) + }; + openshell_router::config::ResolvedRoute { + name: r.name.clone(), + endpoint: r.base_url.clone(), + model: r.model_id.clone(), + api_key: r.api_key.clone(), + protocols: r.protocols.clone(), + auth, + default_headers, + passthrough_headers, + timeout, + } + }) + .collect() +} + +/// Spawn a background task that periodically refreshes both route caches from the gateway. +/// +/// The loop uses the bundle `revision` hash to avoid unnecessary cache writes +/// when routes haven't changed. `initial_revision` is the revision captured +/// during the startup fetch in [`build_inference_context`] so the first refresh +/// cycle can already skip a no-op update. +pub fn spawn_route_refresh( + user_cache: Arc>>, + system_cache: Arc>>, + endpoint: String, + interval_secs: u64, + initial_revision: Option, +) { + tokio::spawn(async move { + use tokio::time::{MissedTickBehavior, interval}; + + let mut current_revision = initial_revision; + + let mut tick = interval(Duration::from_secs(interval_secs)); + tick.set_missed_tick_behavior(MissedTickBehavior::Skip); + + loop { + tick.tick().await; + + match openshell_core::grpc_client::fetch_inference_bundle(&endpoint).await { + Ok(bundle) => { + if current_revision.as_deref() == Some(&bundle.revision) { + trace!(revision = %bundle.revision, "Inference bundle unchanged"); + continue; + } + + let routes = bundle_to_resolved_routes(&bundle); + let (user_routes, system_routes) = partition_routes(routes); + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "updated") + .unmapped("user_route_count", serde_json::json!(user_routes.len())) + .unmapped("system_route_count", serde_json::json!(system_routes.len())) + .unmapped("revision", serde_json::json!(&bundle.revision)) + .message(format!( + "Inference routes updated [user_route_count:{} system_route_count:{} revision:{}]", + user_routes.len(), + system_routes.len(), + bundle.revision + )) + .build()); + current_revision = Some(bundle.revision); + *user_cache.write().await = user_routes; + *system_cache.write().await = system_routes; + } + Err(e) => { + ocsf_emit!(ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Other, "stale") + .unmapped("error", serde_json::json!(e.to_string())) + .message(format!( + "Failed to refresh inference route cache, keeping stale routes [error:{e}]" + )) + .build()); + } + } + } + }); +} + +#[cfg(test)] +#[allow( + clippy::needless_raw_string_hashes, + clippy::similar_names, + reason = "Test code: test fixtures often use idiomatic forms not flagged in production." +)] +mod tests { + use super::*; + use std::sync::{LazyLock, Mutex}; + use temp_env::with_vars; + + static ENV_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); + + #[test] + fn bundle_to_resolved_routes_converts_all_fields() { + let bundle = openshell_core::proto::GetInferenceBundleResponse { + routes: vec![ + openshell_core::proto::ResolvedRoute { + name: "frontier".to_string(), + base_url: "https://api.example.com/v1".to_string(), + api_key: "sk-test-key".to_string(), + model_id: "gpt-4".to_string(), + protocols: vec![ + "openai_chat_completions".to_string(), + "openai_responses".to_string(), + ], + provider_type: "openai".to_string(), + timeout_secs: 0, + }, + openshell_core::proto::ResolvedRoute { + name: "local".to_string(), + base_url: "http://vllm:8000/v1".to_string(), + api_key: "local-key".to_string(), + model_id: "llama-3".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + provider_type: String::new(), + timeout_secs: 120, + }, + ], + revision: "abc123".to_string(), + generated_at_ms: 1000, + }; + + let routes = bundle_to_resolved_routes(&bundle); + + assert_eq!(routes.len(), 2); + assert_eq!(routes[0].endpoint, "https://api.example.com/v1"); + assert_eq!(routes[0].model, "gpt-4"); + assert_eq!(routes[0].api_key, "sk-test-key"); + assert_eq!( + routes[0].auth, + openshell_core::inference::AuthHeader::Bearer + ); + assert_eq!( + routes[0].protocols, + vec!["openai_chat_completions", "openai_responses"] + ); + assert_eq!( + routes[0].timeout, + openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + "timeout_secs=0 should map to default" + ); + assert_eq!(routes[1].endpoint, "http://vllm:8000/v1"); + assert_eq!( + routes[1].auth, + openshell_core::inference::AuthHeader::Bearer + ); + assert_eq!( + routes[1].timeout, + Duration::from_secs(120), + "timeout_secs=120 should map to 120s" + ); + } + + #[test] + fn bundle_to_resolved_routes_handles_empty_bundle() { + let bundle = openshell_core::proto::GetInferenceBundleResponse { + routes: vec![], + revision: "empty".to_string(), + generated_at_ms: 0, + }; + + let routes = bundle_to_resolved_routes(&bundle); + assert!(routes.is_empty()); + } + + #[test] + fn bundle_to_resolved_routes_preserves_name_field() { + let bundle = openshell_core::proto::GetInferenceBundleResponse { + routes: vec![openshell_core::proto::ResolvedRoute { + name: "sandbox-system".to_string(), + base_url: "https://api.example.com/v1".to_string(), + api_key: "key".to_string(), + model_id: "model".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + provider_type: "openai".to_string(), + timeout_secs: 0, + }], + revision: "rev".to_string(), + generated_at_ms: 0, + }; + + let routes = bundle_to_resolved_routes(&bundle); + assert_eq!(routes[0].name, "sandbox-system"); + } + + #[test] + fn routes_segregated_by_name() { + let routes = vec![ + openshell_router::config::ResolvedRoute { + name: "inference.local".to_string(), + endpoint: "https://api.openai.com/v1".to_string(), + model: "gpt-4o".to_string(), + api_key: "key1".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + auth: openshell_core::inference::AuthHeader::Bearer, + default_headers: vec![], + passthrough_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + }, + openshell_router::config::ResolvedRoute { + name: "sandbox-system".to_string(), + endpoint: "https://api.anthropic.com/v1".to_string(), + model: "claude-sonnet-4-20250514".to_string(), + api_key: "key2".to_string(), + protocols: vec!["anthropic_messages".to_string()], + auth: openshell_core::inference::AuthHeader::Custom("x-api-key"), + default_headers: vec![], + passthrough_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + }, + ]; + + let (user, system) = partition_routes(routes); + assert_eq!(user.len(), 1); + assert_eq!(user[0].name, "inference.local"); + assert_eq!(system.len(), 1); + assert_eq!(system[0].name, "sandbox-system"); + } + + // -- build_inference_context tests -- + + #[tokio::test] + async fn build_inference_context_route_file_loads_routes() { + use std::io::Write; + + let yaml = r#" +routes: + - name: inference.local + endpoint: http://localhost:8000/v1 + model: llama-3 + protocols: [openai_chat_completions] + api_key: test-key +"#; + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(yaml.as_bytes()).unwrap(); + let path = f.path().to_str().unwrap(); + + let ctx = build_inference_context(None, None, Some(path)) + .await + .expect("should load routes from file"); + + let ctx = ctx.expect("context should be Some"); + let cache = ctx.route_cache(); + let routes = cache.read().await; + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].endpoint, "http://localhost:8000/v1"); + } + + #[tokio::test] + async fn build_inference_context_empty_route_file_returns_none() { + use std::io::Write; + + // Route file with empty routes list → inference routing disabled (not an error) + let yaml = "routes: []\n"; + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(yaml.as_bytes()).unwrap(); + let path = f.path().to_str().unwrap(); + + let ctx = build_inference_context(None, None, Some(path)) + .await + .expect("empty routes file should not error"); + assert!( + ctx.is_none(), + "empty routes should disable inference routing" + ); + } + + #[tokio::test] + async fn build_inference_context_no_sources_returns_none() { + let ctx = build_inference_context(None, None, None) + .await + .expect("should succeed with None"); + + assert!(ctx.is_none(), "no sources should return None"); + } + + #[tokio::test] + async fn build_inference_context_route_file_overrides_cluster() { + use std::io::Write; + + let yaml = r#" +routes: + - name: inference.local + endpoint: http://localhost:9999/v1 + model: file-model + protocols: [openai_chat_completions] + api_key: file-key +"#; + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(yaml.as_bytes()).unwrap(); + let path = f.path().to_str().unwrap(); + + // Even with sandbox_id and endpoint, route_file takes precedence + let ctx = build_inference_context(Some("sb-1"), Some("http://localhost:50051"), Some(path)) + .await + .expect("should load from file"); + + let ctx = ctx.expect("context should be Some"); + let cache = ctx.route_cache(); + let routes = cache.read().await; + assert_eq!(routes[0].endpoint, "http://localhost:9999/v1"); + } + + #[test] + fn infer_route_source_prefers_file_mode() { + assert_eq!( + infer_route_source( + Some("sb-1"), + Some("http://localhost:50051"), + Some("routes.yaml") + ), + InferenceRouteSource::File + ); + } + + #[test] + fn infer_route_source_cluster_requires_id_and_endpoint() { + assert_eq!( + infer_route_source(Some("sb-1"), Some("http://localhost:50051"), None), + InferenceRouteSource::Cluster + ); + assert_eq!( + infer_route_source(Some("sb-1"), None, None), + InferenceRouteSource::None + ); + assert_eq!( + infer_route_source(None, Some("http://localhost:50051"), None), + InferenceRouteSource::None + ); + } + + #[test] + fn disable_inference_on_empty_routes_depends_on_source() { + assert!(disable_inference_on_empty_routes( + InferenceRouteSource::File + )); + assert!(!disable_inference_on_empty_routes( + InferenceRouteSource::Cluster + )); + assert!(disable_inference_on_empty_routes( + InferenceRouteSource::None + )); + } + + // ---- Route refresh interval + revision tests ---- + + #[test] + fn default_route_refresh_interval_is_five_seconds() { + assert_eq!(DEFAULT_ROUTE_REFRESH_INTERVAL_SECS, 5); + } + + #[test] + fn route_refresh_interval_uses_env_override() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars( + [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("9"))], + || { + assert_eq!(route_refresh_interval_secs(), 9); + }, + ); + } + + #[test] + fn route_refresh_interval_rejects_zero() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars( + [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("0"))], + || { + assert_eq!( + route_refresh_interval_secs(), + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + ); + }, + ); + } + + #[test] + fn route_refresh_interval_rejects_invalid_values() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars( + [("OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS", Some("abc"))], + || { + assert_eq!( + route_refresh_interval_secs(), + DEFAULT_ROUTE_REFRESH_INTERVAL_SECS + ); + }, + ); + } + + #[tokio::test] + async fn route_cache_preserves_content_when_not_written() { + use std::sync::Arc; + use tokio::sync::RwLock; + + let routes = vec![openshell_router::config::ResolvedRoute { + name: "inference.local".to_string(), + endpoint: "http://original:8000/v1".to_string(), + model: "original-model".to_string(), + api_key: "key".to_string(), + auth: openshell_core::inference::AuthHeader::Bearer, + protocols: vec!["openai_chat_completions".to_string()], + default_headers: vec![], + passthrough_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + }]; + + let cache = Arc::new(RwLock::new(routes)); + + // Verify the cache preserves its content — the revision-based skip + // logic in spawn_route_refresh ensures the cache is only written + // when the revision actually changes. + let read = cache.read().await; + assert_eq!(read.len(), 1); + assert_eq!(read[0].model, "original-model"); + } +} diff --git a/crates/openshell-sandbox/src/l7/graphql.rs b/crates/openshell-supervisor-network/src/l7/graphql.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/graphql.rs rename to crates/openshell-supervisor-network/src/l7/graphql.rs diff --git a/crates/openshell-sandbox/src/l7/inference.rs b/crates/openshell-supervisor-network/src/l7/inference.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/inference.rs rename to crates/openshell-supervisor-network/src/l7/inference.rs diff --git a/crates/openshell-sandbox/src/l7/mod.rs b/crates/openshell-supervisor-network/src/l7/mod.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/mod.rs rename to crates/openshell-supervisor-network/src/l7/mod.rs index 703aafae4..31c2071e0 100644 --- a/crates/openshell-sandbox/src/l7/mod.rs +++ b/crates/openshell-supervisor-network/src/l7/mod.rs @@ -122,7 +122,7 @@ pub fn parse_l7_config(val: ®orus::Value) -> Option { let tls = match get_object_str(val, "tls").as_deref() { Some("skip") => TlsMode::Skip, Some("terminate") => { - let event = openshell_ocsf::NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = openshell_ocsf::NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Other) .severity(openshell_ocsf::SeverityId::Medium) .message( @@ -134,7 +134,7 @@ pub fn parse_l7_config(val: ®orus::Value) -> Option { TlsMode::Auto } Some("passthrough") => { - let event = openshell_ocsf::NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = openshell_ocsf::NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Other) .severity(openshell_ocsf::SeverityId::Medium) .message( diff --git a/crates/openshell-sandbox/src/l7/path.rs b/crates/openshell-supervisor-network/src/l7/path.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/path.rs rename to crates/openshell-supervisor-network/src/l7/path.rs diff --git a/crates/openshell-sandbox/src/l7/provider.rs b/crates/openshell-supervisor-network/src/l7/provider.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/provider.rs rename to crates/openshell-supervisor-network/src/l7/provider.rs diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-supervisor-network/src/l7/relay.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/relay.rs rename to crates/openshell-supervisor-network/src/l7/relay.rs index 6d271af21..a20769493 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-supervisor-network/src/l7/relay.rs @@ -11,8 +11,8 @@ use crate::l7::provider::{L7Provider, RelayOutcome}; use crate::l7::rest::WebSocketExtensionMode; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; use crate::opa::{PolicyGenerationGuard, TunnelPolicyEngine}; -use crate::secrets::{self, SecretResolver}; use miette::{IntoDiagnostic, Result, miette}; +use openshell_core::secrets::{self, SecretResolver}; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, NetworkActivityBuilder, SeverityId, StatusId, Url as OcsfUrl, ocsf_emit, @@ -104,7 +104,7 @@ fn emit_parse_rejection(ctx: &L7EvalContext, detail: &str, engine_type: &str) { } else { &ctx.policy_name }; - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -150,7 +150,7 @@ where } // SQL provider is Phase 3 — fall through to passthrough with warning { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Low) .dst_endpoint(Endpoint::from_domain(&ctx.host, ctx.port)) @@ -431,7 +431,7 @@ fn emit_l7_request_log( let summary = graphql_info .map(|info| format!(" {}", graphql_log_summary(info))) .unwrap_or_default(); - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -477,7 +477,7 @@ where "raw bidirectional relay (L7 enforcement no longer active)" }; ocsf_emit!( - NetworkActivityBuilder::new(crate::ocsf_ctx()) + NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .activity_name("Upgrade") .severity(SeverityId::Informational) @@ -735,7 +735,7 @@ where SeverityId::Informational, ), }; - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -828,7 +828,7 @@ fn close_if_stale(guard: &PolicyGenerationGuard, ctx: &L7EvalContext) -> bool { } ocsf_emit!( - NetworkActivityBuilder::new(crate::ocsf_ctx()) + NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -968,7 +968,7 @@ where ), }; let gql_summary = graphql_log_summary(&graphql_info); - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -1229,7 +1229,7 @@ where // Uses redacted_target (path only, no query params) to avoid logging secrets. let has_creds = resolver.is_some(); { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-supervisor-network/src/l7/rest.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/rest.rs rename to crates/openshell-supervisor-network/src/l7/rest.rs index 20d52459c..4f46d24ba 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-supervisor-network/src/l7/rest.rs @@ -9,11 +9,11 @@ use crate::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; use crate::opa::PolicyGenerationGuard; -use crate::secrets::{ - SecretResolver, contains_reserved_credential_marker, rewrite_http_header_block, -}; use base64::Engine as _; use miette::{IntoDiagnostic, Result, miette}; +use openshell_core::secrets::{ + SecretResolver, contains_reserved_credential_marker, rewrite_http_header_block, +}; use sha1::{Digest, Sha1}; use std::collections::{HashMap, HashSet}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; @@ -2001,8 +2001,8 @@ fn is_benign_close(err: &std::io::Error) -> bool { mod tests { use super::*; use crate::opa::OpaEngine; - use crate::secrets::SecretResolver; use flate2::{Compress, Compression, Decompress, FlushCompress, FlushDecompress, Status}; + use openshell_core::secrets::SecretResolver; use std::sync::Arc; const TEST_POLICY: &str = include_str!("../../data/sandbox-policy.rego"); @@ -2285,7 +2285,8 @@ mod tests { #[test] fn deny_response_body_is_agent_readable_and_redacted() { // Agent-readable next_steps is gated on the proposals feature flag. - let _proposals = crate::test_helpers::ProposalsFlagGuard::set_blocking(true); + let _proposals = + openshell_core::proposals::test_helpers::ProposalsFlagGuard::set_blocking(true); let req = L7Request { action: "PUT".to_string(), target: "/repos/NVIDIA/OpenShell/contents/README.md?access_token=secret-token" @@ -2350,7 +2351,8 @@ mod tests { #[test] fn deny_response_body_omits_agent_guidance_when_policy_advisor_is_off() { - let _proposals = crate::test_helpers::ProposalsFlagGuard::set_blocking(false); + let _proposals = + openshell_core::proposals::test_helpers::ProposalsFlagGuard::set_blocking(false); let req = L7Request { action: "GET".to_string(), target: "/gists".to_string(), @@ -2382,7 +2384,8 @@ mod tests { #[tokio::test] async fn send_deny_response_writes_structured_json_403() { // Agent-readable next_steps is gated on the proposals feature flag. - let _proposals = crate::test_helpers::ProposalsFlagGuard::set(true).await; + let _proposals = + openshell_core::proposals::test_helpers::ProposalsFlagGuard::set(true).await; let (mut client, mut server) = tokio::io::duplex(4096); let send = tokio::spawn(async move { let req = L7Request { diff --git a/crates/openshell-sandbox/src/l7/tls.rs b/crates/openshell-supervisor-network/src/l7/tls.rs similarity index 100% rename from crates/openshell-sandbox/src/l7/tls.rs rename to crates/openshell-supervisor-network/src/l7/tls.rs diff --git a/crates/openshell-sandbox/src/l7/websocket.rs b/crates/openshell-supervisor-network/src/l7/websocket.rs similarity index 99% rename from crates/openshell-sandbox/src/l7/websocket.rs rename to crates/openshell-supervisor-network/src/l7/websocket.rs index 2dc1b25c3..c965aacf5 100644 --- a/crates/openshell-sandbox/src/l7/websocket.rs +++ b/crates/openshell-supervisor-network/src/l7/websocket.rs @@ -9,9 +9,9 @@ use crate::l7::relay::{L7EvalContext, evaluate_l7_request}; use crate::l7::{EnforcementMode, L7RequestInfo}; use crate::opa::TunnelPolicyEngine; -use crate::secrets::SecretResolver; use flate2::{Compress, Compression, Decompress, FlushCompress, FlushDecompress, Status}; use miette::{IntoDiagnostic, Result, miette}; +use openshell_core::secrets::SecretResolver; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, NetworkActivityBuilder, SeverityId, StatusId, ocsf_emit, @@ -954,7 +954,7 @@ fn emit_rewrite_event(host: &str, port: u16, policy_name: &str, replacements: us } else { policy_name }; - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -1001,7 +1001,7 @@ fn emit_websocket_l7_event( ), }; let summary = graphql.map(graphql_log_summary).unwrap_or_default(); - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -1082,7 +1082,7 @@ fn emit_protocol_failure(host: &str, port: u16, policy_name: &str, failure_class } else { policy_name }; - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -1105,7 +1105,7 @@ mod tests { use super::*; use crate::l7::relay::L7EvalContext; use crate::opa::{NetworkInput, OpaEngine}; - use crate::secrets::SecretResolver; + use openshell_core::secrets::SecretResolver; use std::path::PathBuf; use tokio::io::{AsyncReadExt, AsyncWriteExt}; diff --git a/crates/openshell-supervisor-network/src/lib.rs b/crates/openshell-supervisor-network/src/lib.rs new file mode 100644 index 000000000..ddb525238 --- /dev/null +++ b/crates/openshell-supervisor-network/src/lib.rs @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Networking component of the `OpenShell` supervisor. +//! +//! Owns the egress proxy, L7 enforcement, OPA policy engine, identity cache, +//! inference routing, TLS interception, and denial aggregation. Populated by +//! follow-up commits as modules migrate out of `openshell-sandbox`. + +pub mod bypass_monitor; +pub mod denial; +pub mod denial_aggregator; +pub mod identity; +pub mod inference_routes; +pub mod l7; +pub mod mechanistic_mapper; +pub mod opa; +pub mod policy_local; +pub mod procfs; +pub mod proxy; +pub mod run; diff --git a/crates/openshell-sandbox/src/mechanistic_mapper.rs b/crates/openshell-supervisor-network/src/mechanistic_mapper.rs similarity index 100% rename from crates/openshell-sandbox/src/mechanistic_mapper.rs rename to crates/openshell-supervisor-network/src/mechanistic_mapper.rs diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-supervisor-network/src/opa.rs similarity index 99% rename from crates/openshell-sandbox/src/opa.rs rename to crates/openshell-supervisor-network/src/opa.rs index f73f3bc14..4dd0350ff 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-supervisor-network/src/opa.rs @@ -7,8 +7,10 @@ //! access decisions. The engine is loaded once at sandbox startup and queried //! on every proxy CONNECT request. -use crate::policy::{FilesystemPolicy, LandlockCompatibility, LandlockPolicy, ProcessPolicy}; use miette::Result; +use openshell_core::policy::{ + FilesystemPolicy, LandlockCompatibility, LandlockPolicy, ProcessPolicy, +}; use openshell_core::proto::SandboxPolicy as ProtoSandboxPolicy; use std::path::{Path, PathBuf}; use std::sync::{ @@ -200,7 +202,7 @@ impl OpaEngine { let (errors, warnings) = crate::l7::validate_l7_policies(&data); for w in &warnings { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "validated") @@ -726,7 +728,7 @@ fn preprocess_yaml_data(yaml_str: &str) -> Result { let (errors, warnings) = crate::l7::validate_l7_policies(&data); for w in &warnings { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "validated") diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-supervisor-network/src/policy_local.rs similarity index 98% rename from crates/openshell-sandbox/src/policy_local.rs rename to crates/openshell-supervisor-network/src/policy_local.rs index fcf6e1f8e..b0ff271f6 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-supervisor-network/src/policy_local.rs @@ -19,6 +19,7 @@ use tokio::sync::RwLock; pub const POLICY_LOCAL_HOST: &str = "policy.local"; /// Filesystem path of the static agent guidance bundle inside the sandbox. +/// /// Single source of truth: the skill installer writes here, the L7 deny body /// references this path in `next_steps`, and the skill's own documentation /// renders the same path. Changing the location is a one-line update here. @@ -28,7 +29,9 @@ pub const SKILL_PATH: &str = "/etc/openshell/skills/policy_advisor.md"; /// instructions than structured next-step JSON alone. pub const AGENT_GUIDANCE: &str = "OpenShell blocked this request with sandbox policy. If the user's task still needs this network action, do not stop here: read /etc/openshell/skills/policy_advisor.md, submit the narrowest policy proposal to http://policy.local/v1/proposals, wait for approval and `policy_reloaded: true`, then retry the original request."; -/// Routes served by the in-sandbox policy advisor API. Held in one place so +/// Routes served by the in-sandbox policy advisor API. +/// +/// Held in one place so /// the L7 deny `next_steps` array, the route dispatcher, the skill content, /// and tests all stay in sync — change the wire path here and every caller /// follows. See `agent_next_steps()` for the consumer that surfaces these @@ -146,7 +149,7 @@ async fn route_request( // when the flag is off — including the diagnostic `current_policy` and // `denials` routes. The skill is also not installed in that mode, so a // disabled sandbox has no entry point into this API at all. - if !crate::agent_proposals_enabled() { + if !openshell_core::proposals::agent_proposals_enabled() { return ( 404, serde_json::json!({ @@ -209,7 +212,9 @@ fn not_found_payload(path: &str) -> (u16, serde_json::Value) { } /// Build the `next_steps` array embedded in the L7 deny body so the agent has -/// machine-readable pointers to this API. Centralizes the shape here to keep +/// machine-readable pointers to this API. +/// +/// Centralizes the shape here to keep /// the deny body and the actual route table from drifting — adding or /// renaming a route only requires touching the route constants above. /// @@ -218,7 +223,7 @@ fn not_found_payload(path: &str) -> (u16, serde_json::Value) { /// caller still emits the field (with `[]`) so the wire shape is stable. #[must_use] pub fn agent_next_steps() -> serde_json::Value { - if !crate::agent_proposals_enabled() { + if !openshell_core::proposals::agent_proposals_enabled() { return serde_json::json!([]); } let host = POLICY_LOCAL_HOST; @@ -249,7 +254,7 @@ pub fn agent_next_steps() -> serde_json::Value { /// Build the optional natural-language guidance embedded in L7 deny bodies. #[must_use] pub fn agent_guidance() -> Option<&'static str> { - crate::agent_proposals_enabled().then_some(AGENT_GUIDANCE) + openshell_core::proposals::agent_proposals_enabled().then_some(AGENT_GUIDANCE) } async fn current_policy_response(ctx: &PolicyLocalContext) -> (u16, serde_json::Value) { @@ -484,7 +489,7 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j Err(error) => return (400, error_payload("invalid_proposal", error)), }; - let client = match crate::grpc_client::CachedOpenShellClient::connect(endpoint).await { + let client = match openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint).await { Ok(client) => client, Err(error) => { return ( @@ -558,7 +563,7 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j /// to see in the audit trace to correlate against the inbox card. fn emit_policy_propose_event(chunk_id: &str, summary: &str) { ocsf_emit!( - ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(SeverityId::Informational) .status(StatusId::Success) .state(StateId::Other, "PROPOSED") @@ -578,7 +583,7 @@ fn emit_policy_decision_event(chunk: &PolicyChunk) { let summary = summarize_chunk_for_audit(chunk); match chunk.status.as_str() { "approved" => ocsf_emit!( - ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(SeverityId::Informational) .status(StatusId::Success) .state(StateId::Enabled, "APPROVED") @@ -600,7 +605,7 @@ fn emit_policy_decision_event(chunk: &PolicyChunk) { format!("\"{sanitized}\"") }; ocsf_emit!( - ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(SeverityId::Low) .status(StatusId::Success) .state(StateId::Disabled, "REJECTED") @@ -877,7 +882,7 @@ fn parse_timeout_query(query: &str) -> u64 { /// per request and reused for every `fetch_chunk` call in a wait loop so a /// 60-second wait does one TLS handshake, not sixty. struct LookupSession<'a> { - client: crate::grpc_client::CachedOpenShellClient, + client: openshell_core::grpc_client::CachedOpenShellClient, sandbox_name: &'a str, } @@ -909,7 +914,7 @@ async fn open_lookup_session( ), ) })?; - let client = crate::grpc_client::CachedOpenShellClient::connect(endpoint) + let client = openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint) .await .map_err(|e| (502, error_payload("gateway_connect_failed", e.to_string())))?; Ok(LookupSession { @@ -1568,7 +1573,7 @@ mod tests { assert!(surfaced.ends_with("...[truncated]")); } - use crate::test_helpers::ProposalsFlagGuard; + use openshell_core::proposals::test_helpers::ProposalsFlagGuard; #[test] fn agent_next_steps_returns_empty_when_flag_off() { diff --git a/crates/openshell-sandbox/src/procfs.rs b/crates/openshell-supervisor-network/src/procfs.rs similarity index 100% rename from crates/openshell-sandbox/src/procfs.rs rename to crates/openshell-supervisor-network/src/procfs.rs diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs similarity index 98% rename from crates/openshell-sandbox/src/proxy.rs rename to crates/openshell-supervisor-network/src/proxy.rs index 30466a465..c1210cb8e 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -3,16 +3,16 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. -use crate::denial_aggregator::DenialEvent; +use crate::denial::DenialEvent; use crate::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; use crate::opa::{NetworkAction, OpaEngine, PolicyGenerationGuard}; -use crate::policy::ProxyPolicy; use crate::policy_local::{POLICY_LOCAL_HOST, PolicyLocalContext}; -use crate::provider_credentials::ProviderCredentialState; -use crate::secrets::{SecretResolver, rewrite_header_line_checked}; use miette::{IntoDiagnostic, Result}; use openshell_core::net::{is_always_blocked_ip, is_internal_ip, is_link_local_ip}; +use openshell_core::policy::ProxyPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; +use openshell_core::secrets::{SecretResolver, rewrite_header_line_checked}; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, Endpoint, HttpActivityBuilder, HttpRequest, NetworkActivityBuilder, Process, SeverityId, StatusId, Url as OcsfUrl, ocsf_emit, @@ -175,7 +175,7 @@ impl ProxyHandle { /// The proxy uses OPA for network decisions with process-identity binding /// via `/proc/net/tcp`. All connections are evaluated through OPA policy. #[allow(clippy::too_many_arguments)] - pub(crate) async fn start_with_bind_addr( + pub async fn start_with_bind_addr( policy: &ProxyPolicy, bind_addr: Option, opa_engine: Arc, @@ -204,7 +204,7 @@ impl ProxyHandle { let listener = TcpListener::bind(http_addr).await.into_diagnostic()?; let local_addr = listener.local_addr().into_diagnostic()?; { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Listen) .severity(SeverityId::Informational) .status(StatusId::Success) @@ -256,7 +256,7 @@ impl ProxyHandle { ) .await { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -267,7 +267,7 @@ impl ProxyHandle { }); } Err(err) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -435,7 +435,7 @@ async fn handle_tcp_connection( ) .await?; if let InferenceOutcome::Denied { reason } = outcome { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -513,7 +513,7 @@ async fn handle_tcp_connection( // Allowed connections are logged after the L7 config check (below) // so we can distinguish CONNECT (L4-only) from CONNECT_L7 (L7 follows). if matches!(decision.action, NetworkAction::Deny { .. }) { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -587,7 +587,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -642,7 +642,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -689,7 +689,7 @@ async fn handle_tcp_connection( } Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -742,7 +742,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -794,7 +794,7 @@ async fn handle_tcp_connection( .into_diagnostic()?, Err(reason) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -860,7 +860,7 @@ async fn handle_tcp_connection( "CONNECT" }; { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -1002,7 +1002,7 @@ async fn handle_tcp_connection( "TLS connection closed" ); } else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1014,7 +1014,7 @@ async fn handle_tcp_connection( } } else { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1067,7 +1067,7 @@ async fn handle_tcp_connection( if is_benign_relay_error(&e) { debug!(host = %host_lc, port = port, error = %e, "L7 connection closed"); } else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1100,7 +1100,7 @@ async fn handle_tcp_connection( if is_benign_relay_error(&e) { debug!(host = %host_lc, port = port, error = %e, "HTTP relay closed"); } else { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1546,7 +1546,7 @@ async fn process_inference_keepalive { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Rejected) @@ -1585,7 +1585,7 @@ async fn route_inference_request( detect_inference_pattern(&request.method, &normalized_path, &ctx.patterns) { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Allowed) .disposition(DispositionId::Detected) @@ -1673,7 +1673,7 @@ async fn route_inference_request( } Ok(Ok(None)) => break, Ok(Err(e)) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -1689,7 +1689,7 @@ async fn route_inference_request( break; } Err(_) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -1713,7 +1713,7 @@ async fn route_inference_request( } Err(e) => { { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -1739,7 +1739,7 @@ async fn route_inference_request( } else { // Not an inference request — deny { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -1832,7 +1832,7 @@ struct L7RouteSnapshot { } fn emit_l7_tunnel_close_after_policy_change(host: &str, port: u16, error: miette::Report) { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -1884,7 +1884,7 @@ fn query_l7_route_snapshot( generation, }), Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2425,7 +2425,7 @@ fn parse_allowed_ips(raw: &[String]) -> std::result::Result, S } if n.prefix_len() < MIN_SAFE_PREFIX_LEN { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .severity(SeverityId::Medium) .message(format!( @@ -2477,7 +2477,7 @@ fn query_allowed_ips( match engine.query_allowed_ips(&input) { Ok(ips) => ips, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2519,7 +2519,7 @@ fn query_exact_declared_endpoint_host( match engine.query_exact_declared_endpoint_host(&input) { Ok(is_exact_declared) => is_exact_declared, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2663,14 +2663,16 @@ fn rewrite_forward_request( path: &str, secret_resolver: Option<&SecretResolver>, request_body_credential_rewrite: bool, -) -> Result, crate::secrets::UnresolvedPlaceholderError> { +) -> Result, openshell_core::secrets::UnresolvedPlaceholderError> { let header_end = raw[..used] .windows(4) .position(|w| w == b"\r\n\r\n") .map_or(used, |p| p + 4); let websocket_upgrade = crate::l7::rest::request_is_websocket_upgrade(&raw[..header_end]); let upstream_path = match secret_resolver { - Some(resolver) => crate::secrets::rewrite_target_for_eval(path, resolver)?.resolved, + Some(resolver) => { + openshell_core::secrets::rewrite_target_for_eval(path, resolver)?.resolved + } None => path.to_string(), }; @@ -2763,10 +2765,10 @@ fn rewrite_forward_request( output.len() }; let output_str = String::from_utf8_lossy(&output[..scan_end]); - if output_str.contains(crate::secrets::PLACEHOLDER_PREFIX_PUBLIC) - || output_str.contains(crate::secrets::PROVIDER_ALIAS_MARKER_PUBLIC) + if output_str.contains(openshell_core::secrets::PLACEHOLDER_PREFIX_PUBLIC) + || output_str.contains(openshell_core::secrets::PROVIDER_ALIAS_MARKER_PUBLIC) { - return Err(crate::secrets::UnresolvedPlaceholderError { location: "header" }); + return Err(openshell_core::secrets::UnresolvedPlaceholderError { location: "header" }); } } @@ -2851,7 +2853,7 @@ async fn handle_forward_proxy( let (scheme, host, port, mut path) = match parse_proxy_uri(target_uri) { Ok(parsed) => parsed, Err(e) => { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -2899,7 +2901,7 @@ async fn handle_forward_proxy( // 2. Reject HTTPS — must use CONNECT for TLS if scheme == "https" { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Rejected) @@ -2975,7 +2977,7 @@ async fn handle_forward_proxy( NetworkAction::Allow { matched_policy } => matched_policy.clone(), NetworkAction::Deny { reason } => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3147,7 +3149,7 @@ async fn handle_forward_proxy( params } Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -3215,7 +3217,7 @@ async fn handle_forward_proxy( { Ok(info) => info, Err(e) => { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Medium) .status(StatusId::Failure) @@ -3258,7 +3260,7 @@ async fn handle_forward_proxy( || { crate::l7::relay::evaluate_l7_request(&tunnel_engine, &l7_ctx, &request_info) .unwrap_or_else(|e| { - let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -3303,7 +3305,7 @@ async fn handle_forward_proxy( } else { "FORWARD_L7" }; - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(action_id) .disposition(disposition_id) @@ -3382,7 +3384,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3438,7 +3440,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3489,7 +3491,7 @@ async fn handle_forward_proxy( } Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3545,7 +3547,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3599,7 +3601,7 @@ async fn handle_forward_proxy( Ok(addrs) => addrs, Err(reason) => { { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -3666,7 +3668,7 @@ async fn handle_forward_proxy( let mut upstream = match TcpStream::connect(addrs.as_slice()).await { Ok(s) => s, Err(e) => { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -3701,7 +3703,7 @@ async fn handle_forward_proxy( // Log success { - let event = HttpActivityBuilder::new(crate::ocsf_ctx()) + let event = HttpActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Other) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs new file mode 100644 index 000000000..962f975bc --- /dev/null +++ b/crates/openshell-supervisor-network/src/run.rs @@ -0,0 +1,491 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Networking stack startup for the sandbox. +//! +//! Builds the network namespace (Linux), the CONNECT proxy with TLS L7 +//! interception, the bypass monitor, the inference context, and the +//! denial-event channel. Returns a [`Networking`] handle whose RAII fields +//! keep the proxy and bypass-monitor tasks alive for the lifetime of the +//! sandbox supervisor. + +use std::net::SocketAddr; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::time::Duration; + +use miette::Result; +use tracing::{debug, info, warn}; + +#[cfg(target_os = "linux")] +use openshell_core::netns::NetworkNamespace; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; +use openshell_core::proto::SandboxPolicy as ProtoSandboxPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; +use openshell_ocsf::{ + ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ctx::ctx as ocsf_ctx, ocsf_emit, +}; + +use crate::denial_aggregator::{DenialAggregator, FlushableDenialSummary}; +use crate::identity::BinaryIdentityCache; +use crate::l7::tls::{ + CertCache, ProxyTlsState, SandboxCa, build_upstream_client_config, read_system_ca_bundle, + write_ca_files, +}; +use crate::mechanistic_mapper; +use crate::opa::OpaEngine; +use crate::policy_local::PolicyLocalContext; +use crate::proxy::ProxyHandle; + +/// Create the workload's network namespace and install bypass detection +/// rules. Returns `None` when the policy is not in proxy mode. Linux-only. +/// +/// The namespace is shared infrastructure: the proxy binds to its host-side +/// veth IP and reads /dev/kmsg from inside it for bypass detection, while +/// the workload child and SSH sessions enter it via `setns()`. +/// +/// # Errors +/// +/// Returns an error if proxy mode is requested but the namespace cannot be +/// created (e.g., missing `CAP_NET_ADMIN` / `CAP_SYS_ADMIN` or `iproute2`). +/// Failure to install nftables bypass-detection rules is non-fatal and is +/// reported via OCSF instead. +#[cfg(target_os = "linux")] +pub fn create_netns_for_proxy(policy: &SandboxPolicy) -> Result> { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return Ok(None); + } + match NetworkNamespace::create() { + Ok(ns) => { + // Install bypass detection rules (nftables log + reject). + // This provides fast-fail UX and diagnostic logging for direct + // connection attempts that bypass the HTTP CONNECT proxy. + let proxy_port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + if let Err(e) = ns.install_bypass_rules(proxy_port) { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "degraded") + .message(format!( + "Failed to install bypass detection rules (non-fatal): {e}" + )) + .build() + ); + } + Ok(Some(ns)) + } + Err(e) => Err(miette::miette!( + "Network namespace creation failed and proxy mode requires isolation. \ + Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ + Error: {e}" + )), + } +} + +/// Handles and values produced by [`run_networking`] that the rest of +/// `run_sandbox` consumes. +/// +/// The `proxy` / `bypass_monitor` fields are RAII handles whose drop +/// tears down the proxy and bypass-monitor tasks. They must remain alive for +/// the duration of the sandbox wait loop, which is achieved by holding the +/// returned `Networking` value in `run_sandbox`'s frame. +pub struct Networking { + pub proxy: Option, + #[cfg(target_os = "linux")] + pub bypass_monitor: Option>, + + pub ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, + pub ssh_proxy_url: Option, + pub ssh_netns_fd: Option, + /// Policy-local route context: shared with the orchestrator's policy poll + /// loop so it can publish updated `SandboxPolicy` snapshots that the + /// `policy.local` route handler returns to the workload. + pub policy_local_ctx: Arc, +} + +/// Set up the networking stack: ephemeral CA + TLS state, proxy server, +/// bypass monitor, and the SSH-side proxy URL / netns FD. +/// +/// The network namespace is created by `run_sandbox` and borrowed in here — +/// it is shared infrastructure used by both the proxy (bind address, bypass +/// monitor) and the workload child (entered via `setns()` in `pre_exec`). +/// +/// # Errors +/// +/// Returns an error if proxy mode is requested but the proxy configuration, +/// OPA engine, or identity cache is missing, if inference route resolution +/// fails, or if the proxy server fails to start. +#[allow(clippy::too_many_arguments)] +pub async fn run_networking( + policy: &SandboxPolicy, + #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, + opa_engine: Option<&Arc>, + retained_proto: Option<&ProtoSandboxPolicy>, + entrypoint_pid: Arc, + provider_credentials: &ProviderCredentialState, + sandbox_id: Option<&str>, + sandbox_name: Option<&str>, + openshell_endpoint: Option<&str>, + inference_routes: Option<&str>, +) -> Result { + // Build the policy-local route context. The orchestrator's policy poll + // loop also holds an `Arc` clone (via `Networking::policy_local_ctx`) so + // it can publish updated policy snapshots after a successful reload. + let policy_local_ctx = Arc::new(PolicyLocalContext::new( + retained_proto.cloned(), + openshell_endpoint.map(str::to_string), + sandbox_name + .map(str::to_string) + .or_else(|| sandbox_id.map(str::to_string)), + )); + + // Spawn a task to resolve policy binary symlinks once the workload's mount + // namespace becomes accessible via /proc//root/. Reads entrypoint_pid + // lazily, so spawning before run_process sets the PID is safe — the probe + // loop just waits. + if let (Some(engine), Some(proto)) = (opa_engine, retained_proto) { + let resolve_engine = engine.clone(); + let resolve_proto = proto.clone(); + let resolve_pid = entrypoint_pid.clone(); + tokio::spawn(async move { + let pid = resolve_pid.load(Ordering::Acquire); + let probe_path = format!("/proc/{pid}/root/"); + // Retry up to 10 times with 500ms intervals (5s total). + // The child's mount namespace is typically ready within a + // few hundred ms of spawn. + for attempt in 1..=10 { + tokio::time::sleep(Duration::from_millis(500)).await; + if std::fs::metadata(&probe_path).is_ok() { + info!( + pid = pid, + attempt = attempt, + "Container filesystem accessible, resolving policy binary symlinks" + ); + match resolve_engine.reload_from_proto_with_pid(&resolve_proto, pid) { + Ok(()) => { + info!( + pid = pid, + "Policy binary symlink resolution complete \ + (check logs above for per-binary results)" + ); + } + Err(e) => { + warn!( + "Failed to rebuild OPA engine with symlink resolution \ + (non-fatal, falling back to literal path matching): {e}" + ); + } + } + return; + } + debug!( + pid = pid, + attempt = attempt, + probe_path = %probe_path, + "Container filesystem not yet accessible, retrying symlink resolution" + ); + } + warn!( + "Container filesystem /proc/{pid}/root/ not accessible after 10 attempts (5s); \ + binary symlink resolution skipped. Policy binary paths will be matched literally. \ + If binaries are symlinks, use canonical paths in your policy \ + (run 'readlink -f ' inside the sandbox)" + ); + }); + } + + // Identity cache for SHA256 TOFU when OPA is active. Only consumed by + // the proxy, so it's owned here. + let identity_cache = opa_engine.map(|_| Arc::new(BinaryIdentityCache::new())); + + // Generate ephemeral CA and TLS state for HTTPS L7 inspection. + // The CA cert is written to disk so sandbox processes can trust it. + let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { + match SandboxCa::generate() { + Ok(ca) => { + let tls_dir = std::path::Path::new("/etc/openshell-tls"); + let system_ca_bundle = read_system_ca_bundle(); + match write_ca_files(&ca, tls_dir, &system_ca_bundle) { + Ok(paths) => { + // /etc/openshell-tls is subsumed by the /etc baseline + // path injected by enrich_*_baseline_paths(), so no + // explicit Landlock entry is needed here. + + let upstream_config = build_upstream_client_config(&system_ca_bundle); + let cert_cache = CertCache::new(ca); + let state = Arc::new(ProxyTlsState::new(cert_cache, upstream_config)); + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "enabled") + .message("TLS termination enabled: ephemeral CA generated") + .build() + ); + (Some(state), Some(paths)) + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .message(format!( + "Failed to write CA files, TLS termination disabled: {e}" + )) + .build() + ); + (None, None) + } + } + } + Err(e) => { + ocsf_emit!( + ConfigStateChangeBuilder::new(ocsf_ctx()) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .state(StateId::Disabled, "disabled") + .message(format!( + "Failed to generate ephemeral CA, TLS termination disabled: {e}" + )) + .build() + ); + (None, None) + } + } + } else { + (None, None) + }; + + let (proxy_handle, denial_rx, bypass_denial_tx) = + if matches!(policy.network.mode, NetworkMode::Proxy) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!( + "Network mode is set to proxy but no proxy configuration was provided" + ) + })?; + + let engine = opa_engine.cloned().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; + + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!( + "Proxy mode requires an identity cache (OPA engine must be configured)" + ) + })?; + + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. + #[cfg(target_os = "linux")] + let bind_addr = netns.map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); + + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; + + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = crate::inference_routes::build_inference_context( + sandbox_id, + openshell_endpoint, + inference_routes, + ) + .await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). + // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) + } else { + (None, None, None) + }; + + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + ) + .await?; + (Some(proxy_handle), denial_rx, bypass_denial_tx) + } else { + (None, None, None) + }; + + // Spawn the denial-aggregator flush task. The aggregator drains denial + // events from the proxy + bypass monitor, batches them, and ships + // summaries to the gateway via SubmitPolicyAnalysis. + if let (Some(rx), Some(endpoint)) = (denial_rx, openshell_endpoint) { + // SubmitPolicyAnalysis resolves by sandbox *name*, not UUID — fall back + // to the ID when the name isn't set. + let agg_name = sandbox_name + .map(str::to_string) + .or_else(|| sandbox_id.map(str::to_string)) + .unwrap_or_default(); + let agg_endpoint = endpoint.to_string(); + let flush_interval_secs: u64 = std::env::var("OPENSHELL_DENIAL_FLUSH_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + + let aggregator = DenialAggregator::new(rx, flush_interval_secs); + + tokio::spawn(async move { + aggregator + .run(|summaries| { + let endpoint = agg_endpoint.clone(); + let sandbox_name = agg_name.clone(); + async move { + if let Err(e) = + flush_proposals_to_gateway(&endpoint, &sandbox_name, summaries).await + { + warn!(error = %e, "Failed to flush denial summaries to gateway"); + } + } + }) + .await; + }); + } + + // Spawn bypass detection monitor (Linux only, proxy mode only). + // Reads /dev/kmsg for nftables log entries and emits structured + // tracing events for direct connection attempts that bypass the proxy. + #[cfg(target_os = "linux")] + let bypass_monitor_handle = netns.and_then(|ns| { + crate::bypass_monitor::spawn( + ns.name().to_string(), + entrypoint_pid.clone(), + bypass_denial_tx, + ) + }); + + // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). + #[cfg(not(target_os = "linux"))] + drop(bypass_denial_tx); + + // Compute the proxy URL and netns fd for SSH sessions. + // SSH shell processes need both to enforce network policy: + // - netns_fd: enter the network namespace via setns() so all traffic + // goes through the veth pair (hard enforcement, non-bypassable) + // - proxy_url: set proxy env vars so cooperative tools route through the + // CONNECT proxy; this also opts Node.js into honoring those vars + #[cfg(target_os = "linux")] + let ssh_netns_fd = netns.and_then(NetworkNamespace::ns_fd); + + #[cfg(not(target_os = "linux"))] + let ssh_netns_fd: Option = None; + + let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { + #[cfg(target_os = "linux")] + { + netns.map(|ns| { + let port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + format!("http://{}:{port}", ns.host_ip()) + }) + } + #[cfg(not(target_os = "linux"))] + { + policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map(|addr| format!("http://{addr}")) + } + } else { + None + }; + + Ok(Networking { + proxy: proxy_handle, + #[cfg(target_os = "linux")] + bypass_monitor: bypass_monitor_handle, + ca_file_paths, + ssh_proxy_url, + ssh_netns_fd, + policy_local_ctx, + }) +} + +/// Flush aggregated denial summaries to the gateway via `SubmitPolicyAnalysis`. +async fn flush_proposals_to_gateway( + endpoint: &str, + sandbox_name: &str, + summaries: Vec, +) -> Result<()> { + use openshell_core::grpc_client::CachedOpenShellClient; + use openshell_core::proto::{DenialSummary, L7RequestSample}; + + let client = CachedOpenShellClient::connect(endpoint).await?; + + let proto_summaries: Vec = summaries + .into_iter() + .map(|s| DenialSummary { + sandbox_id: String::new(), + host: s.host, + port: u32::from(s.port), + binary: s.binary, + ancestors: s.ancestors, + deny_reason: s.deny_reason, + first_seen_ms: s.first_seen_ms, + last_seen_ms: s.last_seen_ms, + count: s.count, + suppressed_count: 0, + total_count: s.count, + sample_cmdlines: s.sample_cmdlines, + binary_sha256: String::new(), + persistent: false, + denial_stage: s.denial_stage, + l7_request_samples: s + .l7_samples + .into_iter() + .map(|l| L7RequestSample { + method: l.method, + path: l.path, + decision: "deny".to_string(), + count: l.count, + }) + .collect(), + l7_inspection_active: false, + }) + .collect(); + + // Run the mechanistic mapper sandbox-side to generate proposals. + // The gateway is a thin persistence + validation layer — it never + // generates proposals itself. + let proposals = mechanistic_mapper::generate_proposals(&proto_summaries); + + info!( + sandbox_name = %sandbox_name, + summaries = proto_summaries.len(), + proposals = proposals.len(), + "Flushed denial analysis to gateway" + ); + + client + .submit_policy_analysis(sandbox_name, proto_summaries, proposals, "mechanistic") + .await?; + + Ok(()) +} diff --git a/crates/openshell-sandbox/testdata/sandbox-policy.yaml b/crates/openshell-supervisor-network/testdata/sandbox-policy.yaml similarity index 100% rename from crates/openshell-sandbox/testdata/sandbox-policy.yaml rename to crates/openshell-supervisor-network/testdata/sandbox-policy.yaml diff --git a/crates/openshell-sandbox/tests/system_inference.rs b/crates/openshell-supervisor-network/tests/system_inference.rs similarity index 94% rename from crates/openshell-sandbox/tests/system_inference.rs rename to crates/openshell-supervisor-network/tests/system_inference.rs index 20c39f3b6..ef1b5f54d 100644 --- a/crates/openshell-sandbox/tests/system_inference.rs +++ b/crates/openshell-supervisor-network/tests/system_inference.rs @@ -9,7 +9,7 @@ use openshell_router::Router; use openshell_router::config::{AuthHeader, ResolvedRoute}; -use openshell_sandbox::proxy::InferenceContext; +use openshell_supervisor_network::proxy::InferenceContext; fn make_system_route() -> ResolvedRoute { ResolvedRoute { @@ -42,7 +42,7 @@ fn make_user_route() -> ResolvedRoute { #[tokio::test] async fn system_inference_routes_to_mock_backend() { let router = Router::new().unwrap(); - let patterns = openshell_sandbox::l7::inference::default_patterns(); + let patterns = openshell_supervisor_network::l7::inference::default_patterns(); let ctx = InferenceContext::new( patterns, @@ -86,7 +86,7 @@ async fn system_inference_routes_to_mock_backend() { #[tokio::test] async fn system_inference_uses_system_routes_not_user_routes() { let router = Router::new().unwrap(); - let patterns = openshell_sandbox::l7::inference::default_patterns(); + let patterns = openshell_supervisor_network::l7::inference::default_patterns(); // Only user routes configured — no system routes let ctx = InferenceContext::new(patterns, router, vec![make_user_route()], vec![]); @@ -118,7 +118,7 @@ async fn system_inference_uses_system_routes_not_user_routes() { #[tokio::test] async fn system_inference_with_anthropic_protocol() { let router = Router::new().unwrap(); - let patterns = openshell_sandbox::l7::inference::default_patterns(); + let patterns = openshell_supervisor_network::l7::inference::default_patterns(); let system_route = ResolvedRoute { name: "sandbox-system".to_string(), diff --git a/crates/openshell-sandbox/tests/websocket_upgrade.rs b/crates/openshell-supervisor-network/tests/websocket_upgrade.rs similarity index 98% rename from crates/openshell-sandbox/tests/websocket_upgrade.rs rename to crates/openshell-supervisor-network/tests/websocket_upgrade.rs index b35076a9a..322d6709c 100644 --- a/crates/openshell-sandbox/tests/websocket_upgrade.rs +++ b/crates/openshell-supervisor-network/tests/websocket_upgrade.rs @@ -26,8 +26,8 @@ use futures::SinkExt; use futures::stream::StreamExt; -use openshell_sandbox::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; -use openshell_sandbox::l7::rest::RestProvider; +use openshell_supervisor_network::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; +use openshell_supervisor_network::l7::rest::RestProvider; use std::collections::HashMap; use std::net::SocketAddr; use tokio::io::{AsyncReadExt, AsyncWriteExt}; diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml new file mode 100644 index 000000000..9b8f50b2a --- /dev/null +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-supervisor-process" +description = "Process component of the OpenShell supervisor: entrypoint spawn, SSH server, supervisor session, netns, bypass monitor" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true + +[dependencies] +openshell-core = { path = "../openshell-core" } +openshell-ocsf = { path = "../openshell-ocsf" } + +anyhow = { workspace = true } +base64 = { workspace = true } +hex = "0.4" +miette = { workspace = true } +nix = { workspace = true } +rand_core = "0.6" +russh = "0.57" +serde_json = { workspace = true } +sha2 = { workspace = true } +tokio = { workspace = true } +tokio-stream = { workspace = true } +tonic = { workspace = true, features = ["channel", "tls"] } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +uuid = { workspace = true } + +[target.'cfg(unix)'.dependencies] +libc = "0.2" +rustix = { workspace = true } + +[target.'cfg(target_os = "linux")'.dependencies] +landlock = "0.4" +seccompiler = "0.5" + +[dev-dependencies] +tempfile = "3" + +[lints] +workspace = true diff --git a/crates/openshell-sandbox/src/child_env.rs b/crates/openshell-supervisor-process/src/child_env.rs similarity index 100% rename from crates/openshell-sandbox/src/child_env.rs rename to crates/openshell-supervisor-process/src/child_env.rs diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-supervisor-process/src/debug_rpc.rs similarity index 99% rename from crates/openshell-sandbox/src/debug_rpc.rs rename to crates/openshell-supervisor-process/src/debug_rpc.rs index af22b7450..f583d54dc 100644 --- a/crates/openshell-sandbox/src/debug_rpc.rs +++ b/crates/openshell-supervisor-process/src/debug_rpc.rs @@ -24,7 +24,7 @@ use openshell_core::proto::{ }; use sha2::{Digest, Sha256}; -use crate::grpc_client::{AuthedChannel, connect_channel_pub}; +use openshell_core::grpc_client::{AuthedChannel, connect_channel_pub}; /// Entry point for the `debug-rpc` subcommand. Returns the process exit /// code; `main` propagates it. diff --git a/crates/openshell-supervisor-process/src/lib.rs b/crates/openshell-supervisor-process/src/lib.rs new file mode 100644 index 000000000..d7401f8a1 --- /dev/null +++ b/crates/openshell-supervisor-process/src/lib.rs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process component of the `OpenShell` supervisor. +//! +//! Owns the entrypoint process spawn, SSH server, supervisor session, network +//! namespace, bypass monitor, child environment construction, skills install, +//! and log push. Populated by follow-up commits as modules migrate out of +//! `openshell-sandbox`. + +pub mod child_env; +pub mod debug_rpc; +pub mod log_push; +pub mod managed_children; +pub mod process; +pub mod run; +pub mod sandbox; +pub mod skills; +pub mod ssh; +pub mod supervisor_session; diff --git a/crates/openshell-sandbox/src/log_push.rs b/crates/openshell-supervisor-process/src/log_push.rs similarity index 99% rename from crates/openshell-sandbox/src/log_push.rs rename to crates/openshell-supervisor-process/src/log_push.rs index fd33d1e07..f65f30433 100644 --- a/crates/openshell-sandbox/src/log_push.rs +++ b/crates/openshell-supervisor-process/src/log_push.rs @@ -7,7 +7,7 @@ //! channel to a background task. The task batches lines and streams them to //! the server using the `PushSandboxLogs` client-streaming RPC. -use crate::grpc_client::CachedOpenShellClient; +use openshell_core::grpc_client::CachedOpenShellClient; use openshell_core::proto::{PushSandboxLogsRequest, SandboxLogLine}; use tokio::sync::mpsc; use tracing::{Event, Subscriber}; diff --git a/crates/openshell-supervisor-process/src/managed_children.rs b/crates/openshell-supervisor-process/src/managed_children.rs new file mode 100644 index 000000000..311c80693 --- /dev/null +++ b/crates/openshell-supervisor-process/src/managed_children.rs @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide tracker for sandbox-managed child PIDs. +//! +//! The supervisor spawns several long-lived children (the entrypoint, SSH +//! sessions). Each registers its PID here on spawn and removes it on exit so +//! the orchestrator's `SIGCHLD` reaper can distinguish supervised processes +//! from incidental zombies. + +#![cfg(target_os = "linux")] + +use std::collections::HashSet; +use std::sync::{LazyLock, Mutex}; + +static MANAGED_CHILDREN: LazyLock>> = + LazyLock::new(|| Mutex::new(HashSet::new())); + +/// Add `pid` to the supervised-child set. Non-positive or out-of-range values +/// are silently ignored. +pub fn register(pid: u32) { + let Ok(pid) = i32::try_from(pid) else { + return; + }; + if pid <= 0 { + return; + } + if let Ok(mut children) = MANAGED_CHILDREN.lock() { + children.insert(pid); + } +} + +/// Remove `pid` from the supervised-child set. Non-positive or out-of-range +/// values are silently ignored. +pub fn unregister(pid: u32) { + let Ok(pid) = i32::try_from(pid) else { + return; + }; + if pid <= 0 { + return; + } + if let Ok(mut children) = MANAGED_CHILDREN.lock() { + children.remove(&pid); + } +} + +/// Return `true` if `pid` is currently in the supervised-child set. +#[must_use] +pub fn is_managed(pid: i32) -> bool { + MANAGED_CHILDREN + .lock() + .is_ok_and(|children| children.contains(&pid)) +} diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-supervisor-process/src/process.rs similarity index 79% rename from crates/openshell-sandbox/src/process.rs rename to crates/openshell-supervisor-process/src/process.rs index 76786a84d..cfd9e7b6e 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -4,15 +4,15 @@ //! Process management and signal handling. use crate::child_env; -use crate::policy::{NetworkMode, SandboxPolicy}; -use crate::sandbox; -#[cfg(target_os = "linux")] -use crate::sandbox::linux::netns::NetworkNamespace; #[cfg(target_os = "linux")] -use crate::{register_managed_child, unregister_managed_child}; +use crate::managed_children; +use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::sys::signal::{self, Signal}; use nix::unistd::{Group, Pid, User}; +#[cfg(target_os = "linux")] +use openshell_core::netns::NetworkNamespace; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; use std::collections::HashMap; use std::ffi::CString; #[cfg(target_os = "linux")] @@ -321,7 +321,7 @@ impl ProcessHandle { let child = cmd.spawn().into_diagnostic()?; let pid = child.id().unwrap_or(0); - register_managed_child(pid); + managed_children::register(pid); debug!(pid, program, "Process spawned"); @@ -418,7 +418,7 @@ impl ProcessHandle { let child = cmd.spawn().into_diagnostic()?; let pid = child.id().unwrap_or(0); #[cfg(target_os = "linux")] - register_managed_child(pid); + managed_children::register(pid); debug!(pid, program, "Process spawned"); @@ -439,7 +439,7 @@ impl ProcessHandle { pub async fn wait(&mut self) -> std::io::Result { let status = self.child.wait().await; #[cfg(target_os = "linux")] - unregister_managed_child(self.pid); + managed_children::unregister(self.pid); let status = status?; Ok(ProcessStatus::from(status)) } @@ -463,7 +463,7 @@ impl ProcessHandle { // First try SIGTERM if let Err(e) = self.signal(Signal::SIGTERM) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ProcessActivityBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ProcessActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Close) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) @@ -489,8 +489,145 @@ impl ProcessHandle { impl Drop for ProcessHandle { fn drop(&mut self) { #[cfg(target_os = "linux")] - unregister_managed_child(self.pid); + managed_children::unregister(self.pid); + } +} + +/// Validate that the `sandbox` user exists in this image. +/// +/// All sandbox images must include a `sandbox` user for privilege dropping. +/// This check runs at supervisor startup (inside the container) where we can +/// inspect `/etc/passwd`. If the user is missing, the sandbox fails fast +/// with a clear error instead of silently running child processes as root. +#[cfg(unix)] +pub fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { + let user_name = policy.process.run_as_user.as_deref().unwrap_or("sandbox"); + + if user_name.is_empty() || user_name == "sandbox" { + match User::from_name("sandbox") { + Ok(Some(_)) => { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Enabled, "validated") + .message("Validated 'sandbox' user exists in image") + .build() + ); + } + Ok(None) => { + return Err(miette::miette!( + "sandbox user 'sandbox' not found in image; \ + all sandbox images must include a 'sandbox' user and group" + )); + } + Err(e) => { + return Err(miette::miette!("failed to look up 'sandbox' user: {e}")); + } + } + } + + Ok(()) +} + +/// Prepare a `read_write` path for the sandboxed process. +/// +/// Returns `true` when the path was created by the supervisor and therefore +/// still needs to be chowned to the sandbox user/group. Existing paths keep +/// their image-defined ownership. +#[cfg(unix)] +fn prepare_read_write_path(path: &std::path::Path) -> Result { + // SECURITY: use symlink_metadata (lstat) to inspect each path *before* + // calling chown. chown follows symlinks, so a malicious container image + // could place a symlink (e.g. /sandbox -> /etc/shadow) to trick the + // root supervisor into transferring ownership of arbitrary files. + // The TOCTOU window between lstat and chown is not exploitable because + // no untrusted process is running yet (the child has not been forked). + if let Ok(meta) = std::fs::symlink_metadata(path) { + if meta.file_type().is_symlink() { + return Err(miette::miette!( + "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", + path.display() + )); + } + + debug!( + path = %path.display(), + "Preserving ownership for existing read_write path" + ); + Ok(false) + } else { + debug!(path = %path.display(), "Creating read_write directory"); + std::fs::create_dir_all(path).into_diagnostic()?; + Ok(true) + } +} + +/// Prepare filesystem for the sandboxed process. +/// +/// Creates `read_write` directories if they don't exist and sets ownership +/// on newly-created paths to the configured sandbox user/group. This runs as +/// the supervisor (root) before forking the child process. +#[cfg(unix)] +pub fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { + use nix::unistd::chown; + + let user_name = match policy.process.run_as_user.as_deref() { + Some(name) if !name.is_empty() => Some(name), + _ => None, + }; + let group_name = match policy.process.run_as_group.as_deref() { + Some(name) if !name.is_empty() => Some(name), + _ => None, + }; + + // If no user/group configured, nothing to do + if user_name.is_none() && group_name.is_none() { + return Ok(()); + } + + // Resolve user and group + let uid = if let Some(name) = user_name { + Some( + User::from_name(name) + .into_diagnostic()? + .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? + .uid, + ) + } else { + None + }; + + let gid = if let Some(name) = group_name { + Some( + Group::from_name(name) + .into_diagnostic()? + .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? + .gid, + ) + } else { + None + }; + + // Create missing read_write paths and only chown the ones we created. + for path in &policy.filesystem.read_write { + if prepare_read_write_path(path)? { + debug!( + path = %path.display(), + ?uid, + ?gid, + "Setting ownership on newly created read_write path" + ); + chown(path, uid, gid).into_diagnostic()?; + } } + + Ok(()) +} + +#[cfg(not(unix))] +pub fn prepare_filesystem(_policy: &SandboxPolicy) -> Result<()> { + Ok(()) } // `effective_gid`/`effective_uid` are intentionally parallel names (same role @@ -658,13 +795,13 @@ impl From for ProcessStatus { #[cfg(test)] mod tests { use super::*; - use crate::policy::{ - FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, - }; #[cfg(unix)] use nix::sys::wait::{WaitStatus, waitpid}; #[cfg(unix)] use nix::unistd::{ForkResult, fork}; + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, + }; #[cfg(unix)] use std::mem::size_of; use std::process::Stdio as StdStdio; @@ -935,4 +1072,104 @@ mod tests { let stdout = String::from_utf8(output.stdout).expect("utf8"); assert!(stdout.contains("ANTHROPIC_API_KEY=openshell:resolve:env:ANTHROPIC_API_KEY")); } + + #[cfg(unix)] + fn sandbox_policy_with_read_write( + path: PathBuf, + run_as_user: Option, + run_as_group: Option, + ) -> SandboxPolicy { + SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy { + read_only: vec![], + read_write: vec![path], + include_workdir: false, + }, + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user, + run_as_group, + }, + } + } + + #[cfg(unix)] + #[test] + fn prepare_read_write_path_creates_missing_directory() { + let dir = tempfile::tempdir().unwrap(); + let missing = dir.path().join("missing").join("nested"); + + assert!(prepare_read_write_path(&missing).unwrap()); + assert!(missing.is_dir()); + } + + #[cfg(unix)] + #[test] + fn prepare_read_write_path_preserves_existing_directory() { + let dir = tempfile::tempdir().unwrap(); + let existing = dir.path().join("existing"); + std::fs::create_dir(&existing).unwrap(); + + assert!(!prepare_read_write_path(&existing).unwrap()); + assert!(existing.is_dir()); + } + + #[cfg(unix)] + #[test] + fn prepare_read_write_path_rejects_symlink() { + use std::os::unix::fs::symlink; + + let dir = tempfile::tempdir().unwrap(); + let target = dir.path().join("target"); + let link = dir.path().join("link"); + std::fs::create_dir(&target).unwrap(); + symlink(&target, &link).unwrap(); + + let error = prepare_read_write_path(&link).unwrap_err(); + assert!( + error + .to_string() + .contains("is a symlink — refusing to chown"), + "unexpected error: {error}" + ); + } + + #[cfg(unix)] + #[test] + fn prepare_filesystem_skips_chown_for_existing_read_write_paths() { + use std::os::unix::fs::MetadataExt; + + if nix::unistd::geteuid().is_root() { + return; + } + + let current_user = User::from_uid(nix::unistd::geteuid()) + .unwrap() + .expect("current user entry"); + let restricted_group = Group::from_gid(nix::unistd::Gid::from_raw(0)) + .unwrap() + .expect("gid 0 group entry"); + if restricted_group.gid == nix::unistd::getegid() { + return; + } + + let dir = tempfile::tempdir().unwrap(); + let existing = dir.path().join("existing"); + std::fs::create_dir(&existing).unwrap(); + let before = std::fs::metadata(&existing).unwrap(); + + let policy = sandbox_policy_with_read_write( + existing.clone(), + Some(current_user.name), + Some(restricted_group.name), + ); + + prepare_filesystem(&policy).expect("existing path should not be re-owned"); + + let after = std::fs::metadata(&existing).unwrap(); + assert_eq!(after.uid(), before.uid()); + assert_eq!(after.gid(), before.gid()); + } } diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs new file mode 100644 index 000000000..f3e77dc20 --- /dev/null +++ b/crates/openshell-supervisor-process/src/run.rs @@ -0,0 +1,377 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Workload supervision entry point. +//! +//! Spawns the SSH server, optional supervisor session, the entrypoint child +//! process, and waits for it to exit (with optional timeout). Long-running +//! background tasks that aren't strictly tied to the workload's lifetime +//! (policy poll loop, denial aggregator, symlink resolver) live in the +//! orchestrator, not here. + +use miette::{IntoDiagnostic, Result}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::time::Duration; +use tokio::time::timeout; +use tracing::info; + +use openshell_ocsf::{ + ActionId, ActivityId, AppLifecycleBuilder, DispositionId, LaunchTypeId, Process as OcsfProcess, + ProcessActivityBuilder, SeverityId, StatusId, ocsf_emit, +}; + +#[cfg(target_os = "linux")] +use openshell_core::netns::NetworkNamespace; +use openshell_core::policy::SandboxPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; + +#[cfg(target_os = "linux")] +use crate::managed_children; +use crate::process::ProcessHandle; + +fn ocsf_ctx() -> &'static openshell_ocsf::SandboxContext { + openshell_ocsf::ctx::ctx() +} + +/// Spawn the workload entrypoint, wire up SSH and supervisor session, and +/// wait for the entrypoint child to exit. +/// +/// # Errors +/// +/// Returns an error if SSH server startup fails, if the entrypoint child +/// fails to spawn, or if waiting for the child returns an OS error. +#[allow(clippy::too_many_arguments, clippy::implicit_hasher)] +pub async fn run_process( + program: &str, + args: &[String], + workdir: Option<&str>, + timeout_secs: u64, + interactive: bool, + sandbox_id: Option<&str>, + openshell_endpoint: Option<&str>, + ssh_socket_path: Option, + policy: &SandboxPolicy, + entrypoint_pid: Arc, + provider_credentials: ProviderCredentialState, + provider_env: std::collections::HashMap, + ssh_proxy_url: Option, + ssh_netns_fd: Option, + ca_file_paths: Option<(std::path::PathBuf, std::path::PathBuf)>, + #[cfg(target_os = "linux")] netns: Option<&NetworkNamespace>, +) -> Result { + // Validate that the sandbox user exists in the image. All sandbox images + // must include a "sandbox" user for privilege dropping; failing fast here + // beats silently running children as root. + #[cfg(unix)] + crate::process::validate_sandbox_user(policy)?; + + // Create read_write directories and chown newly-created ones to the + // sandbox user/group. Runs as the supervisor (root) before the child + // is forked so the workload sees writable paths it owns. + #[cfg(unix)] + crate::process::prepare_filesystem(policy)?; + + // Eagerly fetch initial settings and install the agent skill if the + // proposals flag is on at startup, rather than waiting for the policy + // poll loop's first tick. In offline/file-mode there is no gateway, so + // the flag stays at its default (false) and no skill is installed. + install_initial_agent_skill(sandbox_id, openshell_endpoint).await; + + // Install the supervisor seccomp prelude before spawning any workload-side + // tasks. By this point the orchestrator has finished privileged startup + // helpers (network namespace setup, nftables probes via run_networking), + // and the SSH listener and entrypoint child have not been exposed yet. + crate::sandbox::apply_supervisor_startup_hardening()?; + + // Verify the runtime PID limit can accommodate the policy's pid_max. + #[cfg(target_os = "linux")] + { + let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { + crate::process::RuntimePidLimitMode::Require + } else { + crate::process::RuntimePidLimitMode::Warn + }; + crate::process::check_runtime_pid_limit(pid_limit_mode)?; + } + + // Zombie reaper — openshell-sandbox may run as PID 1 in containers and + // must reap orphaned grandchildren (e.g. background daemons started by + // coding agents) to prevent zombie accumulation. + // + // Use waitid(..., WNOWAIT) so we can inspect exited children before + // actually reaping them. This avoids racing explicit `child.wait()` calls + // for managed children (entrypoint and SSH session processes). + #[cfg(target_os = "linux")] + tokio::spawn(async { + use nix::sys::wait::{Id, WaitPidFlag, WaitStatus, waitid, waitpid}; + use tokio::signal::unix::{SignalKind, signal}; + use tokio::time::MissedTickBehavior; + + let mut sigchld = match signal(SignalKind::child()) { + Ok(s) => s, + Err(e) => { + tracing::warn!(error = %e, "Failed to register SIGCHLD handler for zombie reaping"); + return; + } + }; + let mut retry = tokio::time::interval(Duration::from_secs(5)); + retry.set_missed_tick_behavior(MissedTickBehavior::Skip); + + loop { + tokio::select! { + _ = sigchld.recv() => {} + _ = retry.tick() => {} + } + + loop { + let status = match waitid( + Id::All, + WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT, + ) { + Ok(WaitStatus::StillAlive) | Err(nix::errno::Errno::ECHILD) => break, + Ok(status) => status, + Err(nix::errno::Errno::EINTR) => continue, + Err(e) => { + tracing::debug!(error = %e, "waitid error during zombie reaping"); + break; + } + }; + + let Some(pid) = status.pid() else { + break; + }; + + if managed_children::is_managed(pid.as_raw()) { + // Let the explicit waiter own this child status. + break; + } + + match waitpid(pid, Some(WaitPidFlag::WNOHANG)) { + Ok(WaitStatus::StillAlive) + | Err(nix::errno::Errno::ECHILD | nix::errno::Errno::EINTR) => {} + Ok(reaped) => { + tracing::debug!(?reaped, "Reaped orphaned child process"); + } + Err(e) => { + tracing::debug!(error = %e, "waitpid error during orphan reap"); + break; + } + } + } + } + }); + + let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); + if let Some(listen_path) = ssh_socket_path.clone() { + let policy_clone = policy.clone(); + let workdir_clone = workdir.map(str::to_string); + let proxy_url = ssh_proxy_url; + let netns_fd = ssh_netns_fd; + let ca_paths = ca_file_paths.clone(); + let provider_credentials_clone = provider_credentials.clone(); + + let (ssh_ready_tx, ssh_ready_rx) = tokio::sync::oneshot::channel(); + + tokio::spawn(async move { + if let Err(err) = crate::ssh::run_ssh_server( + listen_path, + ssh_ready_tx, + policy_clone, + workdir_clone, + netns_fd, + proxy_url, + ca_paths, + provider_credentials_clone, + ) + .await + { + ocsf_emit!( + AppLifecycleBuilder::new(ocsf_ctx()) + .activity(ActivityId::Fail) + .severity(SeverityId::Critical) + .status(StatusId::Failure) + .message(format!("SSH server failed: {err}")) + .build() + ); + } + }); + + // Wait for the SSH server to bind its socket before spawning the + // entrypoint process. This prevents exec requests from racing against + // SSH server startup when Kubernetes marks the pod Ready. + match timeout(Duration::from_secs(10), ssh_ready_rx).await { + Ok(Ok(Ok(()))) => { + ocsf_emit!( + AppLifecycleBuilder::new(ocsf_ctx()) + .activity(ActivityId::Open) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .message("SSH server is ready to accept connections") + .build() + ); + } + Ok(Ok(Err(err))) => { + return Err(err.context("SSH server failed during startup")); + } + Ok(Err(_)) => { + return Err(miette::miette!( + "SSH server task panicked before signaling ready" + )); + } + Err(_) => { + return Err(miette::miette!( + "SSH server did not start within 10 seconds" + )); + } + } + } + + // Spawn the persistent supervisor session if we have a gateway endpoint + // and sandbox identity. The session provides relay channels for SSH + // connect and ExecSandbox through the gateway. + if let (Some(endpoint), Some(id), Some(socket)) = + (openshell_endpoint, sandbox_id, ssh_socket_path.as_ref()) + { + crate::supervisor_session::spawn( + endpoint.to_string(), + id.to_string(), + socket.clone(), + ssh_netns_fd, + ); + info!("supervisor session task spawned"); + } + + #[cfg(target_os = "linux")] + let mut handle = ProcessHandle::spawn( + program, + args, + workdir, + interactive, + policy, + netns, + ca_file_paths.as_ref(), + &provider_env, + )?; + + #[cfg(not(target_os = "linux"))] + let mut handle = ProcessHandle::spawn( + program, + args, + workdir, + interactive, + policy, + ca_file_paths.as_ref(), + &provider_env, + )?; + + // Store the entrypoint PID so the proxy can resolve TCP peer identity + entrypoint_pid.store(handle.pid(), Ordering::Release); + ocsf_emit!( + ProcessActivityBuilder::new(ocsf_ctx()) + .activity(ActivityId::Open) + .action(ActionId::Allowed) + .disposition(DispositionId::Allowed) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .launch_type(LaunchTypeId::Spawn) + .process(OcsfProcess::new(program, i64::from(handle.pid()))) + .message(format!("Process started: pid={}", handle.pid())) + .build() + ); + + // Wait for process with optional timeout + let result = if timeout_secs > 0 { + if let Ok(result) = timeout(Duration::from_secs(timeout_secs), handle.wait()).await { + result + } else { + ocsf_emit!( + ProcessActivityBuilder::new(ocsf_ctx()) + .activity(ActivityId::Close) + .action(ActionId::Denied) + .disposition(DispositionId::Blocked) + .severity(SeverityId::Critical) + .status(StatusId::Failure) + .message("Process timed out, killing") + .build() + ); + handle.kill()?; + return Ok(124); // Standard timeout exit code + } + } else { + handle.wait().await + }; + + let status = result.into_diagnostic()?; + + ocsf_emit!( + ProcessActivityBuilder::new(ocsf_ctx()) + .activity(ActivityId::Close) + .action(ActionId::Allowed) + .disposition(DispositionId::Allowed) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .exit_code(status.code()) + .message(format!("Process exited with code {}", status.code())) + .build() + ); + + Ok(status.code()) +} + +/// Eagerly fetch initial settings and install the agent-driven policy +/// proposal skill if the flag is on at startup. +/// +/// Without this, the skill would only get installed on the policy poll +/// loop's first false→true transition, which can be ~10 s after launch — +/// long enough for an agent to start running without seeing it. +/// +/// Best-effort: any failure (no gateway, RPC error, install failure) is +/// logged but does not fail sandbox startup. +async fn install_initial_agent_skill(sandbox_id: Option<&str>, openshell_endpoint: Option<&str>) { + use openshell_core::proto::setting_value; + use std::sync::atomic::Ordering; + + let Some(flag) = openshell_core::proposals::AGENT_PROPOSALS_ENABLED.get() else { + // The orchestrator is responsible for setting the OnceLock before + // calling run_process. If it isn't set, behave as if the flag is + // off and skip the install. + tracing::debug!("AGENT_PROPOSALS_ENABLED not initialized; skipping skill install"); + return; + }; + + if let (Some(id), Some(endpoint)) = (sandbox_id, openshell_endpoint) + && let Ok(client) = + openshell_core::grpc_client::CachedOpenShellClient::connect(endpoint).await + && let Ok(result) = client.poll_settings(id).await + { + let initial = result + .settings + .get(openshell_core::settings::AGENT_POLICY_PROPOSALS_ENABLED_KEY) + .and_then(|es| es.value.as_ref()) + .and_then(|sv| sv.value.as_ref()) + .and_then(|v| match v { + setting_value::Value::BoolValue(b) => Some(*b), + _ => None, + }) + .unwrap_or(false); + flag.store(initial, Ordering::Relaxed); + } + + if openshell_core::proposals::agent_proposals_enabled() { + match crate::skills::install_static_skills() { + Ok(installed) => info!( + path = %installed.policy_advisor.display(), + "Installed sandbox agent skill" + ), + Err(error) => tracing::warn!( + error = %error, + "Failed to install sandbox agent skill" + ), + } + } else { + tracing::debug!( + "agent_policy_proposals_enabled is false at startup; skipping skill install" + ); + } +} diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-supervisor-process/src/sandbox/linux/landlock.rs similarity index 97% rename from crates/openshell-sandbox/src/sandbox/linux/landlock.rs rename to crates/openshell-supervisor-process/src/sandbox/linux/landlock.rs index 6b121e0ca..8808a1a87 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-supervisor-process/src/sandbox/linux/landlock.rs @@ -3,12 +3,12 @@ //! Landlock filesystem sandboxing. -use crate::policy::{LandlockCompatibility, SandboxPolicy}; use landlock::{ ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, RulesetAttr, RulesetCreatedAttr, }; use miette::{IntoDiagnostic, Result}; +use openshell_core::policy::{LandlockCompatibility, SandboxPolicy}; use std::path::{Path, PathBuf}; use tracing::debug; @@ -128,7 +128,7 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result { openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) @@ -161,7 +161,7 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result) -> Result { if matches!(compatibility, LandlockCompatibility::BestEffort) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) @@ -278,7 +278,7 @@ pub fn enforce(prepared: PreparedRuleset) -> Result<()> { if let Err(err) = result { if matches!(prepared.compatibility, LandlockCompatibility::BestEffort) { openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) @@ -354,7 +354,7 @@ fn try_open_path(path: &Path, compatibility: &LandlockCompatibility) -> Result) { if total_paths == 0 { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Other, "skipped") @@ -100,7 +98,7 @@ pub fn log_sandbox_readiness(policy: &SandboxPolicy, workdir: Option<&str>) { let availability = landlock::probe_availability(); if let landlock::LandlockAvailability::Available { abi } = &availability { openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) .severity(openshell_ocsf::SeverityId::Informational) .status(openshell_ocsf::StatusId::Success) .state(openshell_ocsf::StateId::Enabled, "probed") @@ -118,7 +116,7 @@ pub fn log_sandbox_readiness(policy: &SandboxPolicy, workdir: Option<&str>) { // previously invisible because it only fired inside pre_exec. let is_best_effort = matches!( policy.landlock.compatibility, - crate::policy::LandlockCompatibility::BestEffort + openshell_core::policy::LandlockCompatibility::BestEffort ); let (desc, msg) = if is_best_effort { ( @@ -149,7 +147,7 @@ pub fn log_sandbox_readiness(policy: &SandboxPolicy, workdir: Option<&str>) { ) }; openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::High) .confidence(openshell_ocsf::ConfidenceId::High) diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-supervisor-process/src/sandbox/linux/seccomp.rs similarity index 99% rename from crates/openshell-sandbox/src/sandbox/linux/seccomp.rs rename to crates/openshell-supervisor-process/src/sandbox/linux/seccomp.rs index 1044623f5..4933cd181 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-supervisor-process/src/sandbox/linux/seccomp.rs @@ -12,8 +12,8 @@ //! needed syscalls (`execveat+AT_EMPTY_PATH`, `unshare+CLONE_NEWUSER`, //! `seccomp+SET_MODE_FILTER`) -use crate::policy::{NetworkMode, SandboxPolicy}; use miette::{IntoDiagnostic, Result}; +use openshell_core::policy::{NetworkMode, SandboxPolicy}; use seccompiler::{ SeccompAction, SeccompCmpArgLen, SeccompCmpOp, SeccompCondition, SeccompFilter, SeccompRule, apply_filter, apply_filter_all_threads, diff --git a/crates/openshell-sandbox/src/sandbox/mod.rs b/crates/openshell-supervisor-process/src/sandbox/mod.rs similarity index 93% rename from crates/openshell-sandbox/src/sandbox/mod.rs rename to crates/openshell-supervisor-process/src/sandbox/mod.rs index 95aeae492..ff44f8ba1 100644 --- a/crates/openshell-sandbox/src/sandbox/mod.rs +++ b/crates/openshell-supervisor-process/src/sandbox/mod.rs @@ -3,8 +3,8 @@ //! Platform sandboxing implementation. -use crate::policy::SandboxPolicy; use miette::Result; +use openshell_core::policy::SandboxPolicy; #[cfg(target_os = "linux")] pub mod linux; @@ -28,7 +28,7 @@ pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { { let _ = (policy, workdir); openshell_ocsf::ocsf_emit!( - openshell_ocsf::DetectionFindingBuilder::new(crate::ocsf_ctx()) + openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx()) .activity(openshell_ocsf::ActivityId::Open) .severity(openshell_ocsf::SeverityId::Medium) .finding_info(openshell_ocsf::FindingInfo::new( diff --git a/crates/openshell-sandbox/src/skills.rs b/crates/openshell-supervisor-process/src/skills.rs similarity index 100% rename from crates/openshell-sandbox/src/skills.rs rename to crates/openshell-supervisor-process/src/skills.rs diff --git a/crates/openshell-sandbox/src/skills/policy-advisor/SKILL.md b/crates/openshell-supervisor-process/src/skills/policy-advisor/SKILL.md similarity index 100% rename from crates/openshell-sandbox/src/skills/policy-advisor/SKILL.md rename to crates/openshell-supervisor-process/src/skills/policy-advisor/SKILL.md diff --git a/crates/openshell-sandbox/src/skills/policy_advisor.md b/crates/openshell-supervisor-process/src/skills/policy_advisor.md similarity index 100% rename from crates/openshell-sandbox/src/skills/policy_advisor.md rename to crates/openshell-supervisor-process/src/skills/policy_advisor.md diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-supervisor-process/src/ssh.rs similarity index 98% rename from crates/openshell-sandbox/src/ssh.rs rename to crates/openshell-supervisor-process/src/ssh.rs index 67fbc7e57..366e45def 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-supervisor-process/src/ssh.rs @@ -4,15 +4,15 @@ //! Embedded SSH server for sandbox access. use crate::child_env; -use crate::policy::SandboxPolicy; +#[cfg(target_os = "linux")] +use crate::managed_children; use crate::process::drop_privileges; -use crate::provider_credentials::ProviderCredentialState; use crate::sandbox; -#[cfg(target_os = "linux")] -use crate::{register_managed_child, unregister_managed_child}; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; use nix::unistd::setsid; +use openshell_core::policy::SandboxPolicy; +use openshell_core::provider_credentials::ProviderCredentialState; use openshell_ocsf::{ ActionId, ActivityId, DispositionId, SeverityId, SshActivityBuilder, StatusId, ocsf_emit, }; @@ -86,7 +86,7 @@ fn ssh_server_init( } ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Listen) .severity(SeverityId::Informational) .status(StatusId::Success) @@ -146,7 +146,7 @@ pub async fn run_ssh_server( .await { ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -173,7 +173,7 @@ async fn handle_connection( // not by an application-level preface. The supervisor bridges the // gateway's RelayStream directly into this socket. ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Open) .action(ActionId::Allowed) .disposition(DispositionId::Allowed) @@ -292,7 +292,7 @@ impl russh::server::Handler for SshHandler { // uses u32 for ports, but valid TCP ports are 0-65535. Without this // check, port 65537 truncates to port 1 (privileged). if port_to_connect > u32::from(u16::MAX) { - ocsf_emit!(SshActivityBuilder::new(crate::ocsf_ctx()) + ocsf_emit!(SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -307,7 +307,7 @@ impl russh::server::Handler for SshHandler { // Only allow forwarding to loopback destinations to prevent the // sandbox SSH server from being used as a generic proxy. if !is_loopback_host(host_to_connect) { - ocsf_emit!(SshActivityBuilder::new(crate::ocsf_ctx()) + ocsf_emit!(SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Blocked) @@ -331,7 +331,7 @@ impl russh::server::Handler for SshHandler { Ok(stream) => stream, Err(err) => { ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) .severity(SeverityId::Low) .status(StatusId::Failure) @@ -465,7 +465,7 @@ impl russh::server::Handler for SshHandler { state.input_sender = Some(input_sender); } else { ocsf_emit!( - SshActivityBuilder::new(crate::ocsf_ctx()) + SshActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Refuse) .action(ActionId::Denied) .disposition(DispositionId::Rejected) @@ -797,7 +797,7 @@ fn spawn_pty_shell( #[cfg(target_os = "linux")] let child_pid = child.id(); #[cfg(target_os = "linux")] - register_managed_child(child_pid); + managed_children::register(child_pid); let master_file = master; let (sender, receiver) = mpsc::channel::>(); @@ -843,7 +843,7 @@ fn spawn_pty_shell( std::thread::spawn(move || { let status = child.wait().ok(); #[cfg(target_os = "linux")] - unregister_managed_child(child_pid); + managed_children::unregister(child_pid); let code = status.and_then(|s| s.code()).unwrap_or(1).unsigned_abs(); // Wait for the reader thread to finish forwarding all output before // sending exit-status and closing the channel. This prevents the @@ -943,7 +943,7 @@ fn spawn_pipe_exec( #[cfg(target_os = "linux")] let child_pid = child.id(); #[cfg(target_os = "linux")] - register_managed_child(child_pid); + managed_children::register(child_pid); let child_stdin = child.stdin.take(); let child_stdout = child.stdout.take().expect("stdout must be piped"); @@ -1015,7 +1015,7 @@ fn spawn_pipe_exec( std::thread::spawn(move || { let status = child.wait().ok(); #[cfg(target_os = "linux")] - unregister_managed_child(child_pid); + managed_children::unregister(child_pid); let code = status.and_then(|s| s.code()).unwrap_or(1).unsigned_abs(); // Wait for both reader threads. let _ = reader_done_rx.recv_timeout(Duration::from_secs(2)); @@ -1470,7 +1470,7 @@ mod tests { #[cfg(unix)] #[test] fn pre_exec_always_calls_drop_privileges() { - use crate::policy::{ + use openshell_core::policy::{ FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, SandboxPolicy, }; diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-supervisor-process/src/supervisor_session.rs similarity index 96% rename from crates/openshell-sandbox/src/supervisor_session.rs rename to crates/openshell-supervisor-process/src/supervisor_session.rs index 4d7392ee3..7c524676c 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-supervisor-process/src/supervisor_session.rs @@ -30,7 +30,7 @@ use tokio::sync::mpsc; use tokio_stream::StreamExt; use tracing::{debug, warn}; -use crate::grpc_client; +use openshell_core::grpc_client; const INITIAL_BACKOFF: Duration = Duration::from_secs(1); const MAX_BACKOFF: Duration = Duration::from_secs(30); @@ -258,13 +258,18 @@ async fn run_session_loop( match run_single_session(&endpoint, &sandbox_id, &ssh_socket_path, netns_fd).await { Ok(()) => { - let event = session_closed_event(crate::ocsf_ctx(), &endpoint, &sandbox_id); + let event = + session_closed_event(openshell_ocsf::ctx::ctx(), &endpoint, &sandbox_id); ocsf_emit!(event); break; } Err(e) => { - let event = - session_failed_event(crate::ocsf_ctx(), &endpoint, attempt, &e.to_string()); + let event = session_failed_event( + openshell_ocsf::ctx::ctx(), + &endpoint, + attempt, + &e.to_string(), + ); ocsf_emit!(event); tokio::time::sleep(backoff).await; backoff = (backoff * 2).min(MAX_BACKOFF); @@ -326,7 +331,7 @@ async fn run_single_session( let heartbeat_secs = accepted.heartbeat_interval_secs.max(5); let event = session_established_event( - crate::ocsf_ctx(), + openshell_ocsf::ctx::ctx(), endpoint, &accepted.session_id, heartbeat_secs, @@ -385,20 +390,23 @@ fn handle_gateway_message( let ssh_socket_path = ssh_socket_path.to_path_buf(); let tx = tx.clone(); - let event = relay_open_event(crate::ocsf_ctx(), &relay_open, &ssh_socket_path); + let event = relay_open_event(openshell_ocsf::ctx::ctx(), &relay_open, &ssh_socket_path); ocsf_emit!(event); tokio::spawn(async move { let event_open = relay_open.clone(); match handle_relay_open(relay_open, &ssh_socket_path, netns_fd, channel, tx).await { Ok(()) => { - let event = - relay_closed_event(crate::ocsf_ctx(), &event_open, &ssh_socket_path); + let event = relay_closed_event( + openshell_ocsf::ctx::ctx(), + &event_open, + &ssh_socket_path, + ); ocsf_emit!(event); } Err(e) => { let event = relay_failed_event( - crate::ocsf_ctx(), + openshell_ocsf::ctx::ctx(), &event_open, &ssh_socket_path, &e.to_string(), @@ -415,8 +423,11 @@ fn handle_gateway_message( }); } Some(gateway_message::Payload::RelayClose(close)) => { - let event = - relay_close_from_gateway_event(crate::ocsf_ctx(), &close.channel_id, &close.reason); + let event = relay_close_from_gateway_event( + openshell_ocsf::ctx::ctx(), + &close.channel_id, + &close.reason, + ); ocsf_emit!(event); } _ => {