Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions architecture/compute-runtimes.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ reason strings.
| Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. |
| Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. |
| VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. The driver persists each accepted launch request beside the overlay and restarts those VMs on driver startup without recreating the overlay. |
| External | Out-of-tree compute drivers. | Defined by the operator-supplied driver. | Gateway connects a `tonic::Channel` to a Unix domain socket served by an external driver process speaking the existing `compute_driver.proto` contract. Activated by `--compute-driver-socket=<path>` (env `OPENSHELL_COMPUTE_DRIVER_SOCKET`); skips both the `--drivers` list and auto-detection. The operator is responsible for the driver process lifecycle and the socket's filesystem permissions, mirroring the trust boundary of `--drivers vm`. |

Per-sandbox CPU and memory values currently enter the driver layer through
template resource limits. Docker and Podman apply them as runtime limits.
Expand Down
115 changes: 110 additions & 5 deletions crates/openshell-core/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,44 +40,76 @@ pub const DEFAULT_SUPERVISOR_IMAGE: &str = "ghcr.io/nvidia/openshell/supervisor:
pub const CDI_GPU_DEVICE_ALL: &str = "nvidia.com/gpu=all";

/// Compute backends the gateway can orchestrate sandboxes through.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
//
// Note: this enum is NOT `Copy` because the `External` variant owns a
// `PathBuf`, which is not `Copy`. Upstream's earlier shape derived `Copy`
// and had `as_str(self)`; `as_str` now borrows (`&self`) instead so it
// does not consume the value. Call sites that used to copy a kind out of
// a reference must `.clone()` it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ComputeDriverKind {
Kubernetes,
Vm,
Docker,
Podman,
/// Out-of-process compute driver speaking the gRPC `compute_driver.proto`
/// contract over a Unix domain socket. The path is supplied by
/// `--compute-driver-socket` or `OPENSHELL_COMPUTE_DRIVER_SOCKET`.
External(PathBuf),
}

impl ComputeDriverKind {
#[must_use]
pub const fn as_str(self) -> &'static str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Kubernetes => "kubernetes",
Self::Vm => "vm",
Self::Docker => "docker",
Self::Podman => "podman",
Self::External(_) => "external",
}
}
}

// Human-readable rendering of a driver kind. The built-in kinds print their
// `as_str` name; `External` prints `external:<path>` using `Path::display`.
impl fmt::Display for ComputeDriverKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(self.as_str())
match self {
// Path-less kinds: the name alone identifies the driver.
Self::Kubernetes | Self::Vm | Self::Docker | Self::Podman => f.write_str(self.as_str()),
// Include the socket path so the output carries the same
// `external:` prefix that `FromStr` recognises.
Self::External(path) => write!(f, "external:{}", path.display()),
}
}
}

impl FromStr for ComputeDriverKind {
type Err = String;

fn from_str(value: &str) -> Result<Self, Self::Err> {
match value.trim().to_ascii_lowercase().as_str() {
let trimmed = value.trim();
let lower = trimmed.to_ascii_lowercase();
if let Some(suffix_lower) = lower.strip_prefix("external:") {
// Use the case-preserving suffix for the path.
let suffix = &trimmed[trimmed.len() - suffix_lower.len()..];
if suffix.is_empty() {
return Err(
"compute driver 'external:' requires a non-empty socket path \
(e.g. 'external:/var/run/openshell-driver.sock')"
.to_string(),
);
}
return Ok(Self::External(PathBuf::from(suffix)));
}
match lower.as_str() {
"kubernetes" => Ok(Self::Kubernetes),
"vm" => Ok(Self::Vm),
"docker" => Ok(Self::Docker),
"podman" => Ok(Self::Podman),
"external" => Err(
"compute driver 'external' requires a socket path: 'external:/path/to/driver.sock' (or set --compute-driver-socket)"
.to_string(),
),
other => Err(format!(
"unsupported compute driver '{other}'. expected one of: kubernetes, vm, docker, podman"
"unsupported compute driver '{other}'. expected one of: kubernetes, vm, docker, podman, external:<path>"
)),
}
}
Expand Down Expand Up @@ -628,6 +660,41 @@ mod tests {
assert!(err.contains("unsupported compute driver 'firecracker'"));
}

#[test]
fn compute_driver_kind_external_displays_with_path() {
    // `Display` for `External` must carry the socket path after the prefix.
    let rendered = ComputeDriverKind::External(PathBuf::from("/x/y")).to_string();
    assert_eq!(rendered, "external:/x/y");
}

#[test]
fn compute_driver_kind_parses_external_with_socket_path() {
    // The text after `external:` becomes the variant's socket path verbatim.
    let expected = PathBuf::from("/var/run/openshell-driver.sock");
    match "external:/var/run/openshell-driver.sock"
        .parse::<ComputeDriverKind>()
        .unwrap()
    {
        ComputeDriverKind::External(path) => assert_eq!(path, expected),
        other => panic!("expected External(_), got {other:?}"),
    }
}

#[test]
fn compute_driver_kind_rejects_bare_external_without_path() {
    // `external` with no `:<path>` must fail with a hint about the path.
    let outcome: Result<ComputeDriverKind, _> = "external".parse();
    let err = outcome.unwrap_err();
    let mentions_path = err.contains("requires a socket path");
    assert!(mentions_path, "missing socket-path hint in error: {err}");
}

#[test]
fn compute_driver_kind_unknown_error_lists_external_in_supported() {
    // The "unsupported" error enumerates valid kinds; `external:<path>` must
    // be one of them.
    let outcome: Result<ComputeDriverKind, _> = "unknown".parse();
    let err = outcome.unwrap_err();
    let lists_external = err.contains("external:<path>");
    assert!(
        lists_external,
        "expected supported list to mention external:<path>, got: {err}"
    );
}

#[test]
fn config_defaults_to_loopback_bind_address() {
let expected: SocketAddr = "127.0.0.1:17670".parse().expect("valid address");
Expand Down Expand Up @@ -754,4 +821,42 @@ mod tests {
}
}
}

#[test]
fn compute_driver_kind_display_roundtrips_through_from_str() {
    use std::path::PathBuf;

    // Every variant, including one `External`, must survive
    // `to_string()` followed by `parse()` unchanged.
    let kinds = vec![
        ComputeDriverKind::Kubernetes,
        ComputeDriverKind::Vm,
        ComputeDriverKind::Docker,
        ComputeDriverKind::Podman,
        ComputeDriverKind::External(PathBuf::from("/var/run/openshell-driver.sock")),
    ];

    for kind in kinds {
        let s = kind.to_string();
        let reparsed = s.parse::<ComputeDriverKind>().expect("round-trip parse");
        assert_eq!(reparsed, kind, "round-trip mismatch for {s}");
    }
}

#[test]
fn compute_driver_kind_rejects_external_with_empty_path() {
    // The prefix alone, with nothing after the colon, is rejected.
    let outcome: Result<ComputeDriverKind, _> = "external:".parse();
    let err = outcome.unwrap_err();
    let mentions_empty_path = err.contains("non-empty socket path");
    assert!(mentions_empty_path, "unexpected error: {err}");
}

#[test]
fn compute_driver_kind_external_is_case_insensitive_on_prefix() {
    // Only the prefix is matched case-insensitively; the path is compared
    // exactly as written.
    let expected = PathBuf::from("/var/run/openshell-driver.sock");
    let parsed = "External:/var/run/openshell-driver.sock"
        .parse::<ComputeDriverKind>()
        .expect("case-insensitive prefix should be accepted");
    match parsed {
        ComputeDriverKind::External(p) => assert_eq!(p, expected),
        other => panic!("expected External, got {other:?}"),
    }
}
}
104 changes: 102 additions & 2 deletions crates/openshell-server/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@ struct RunArgs {
)]
drivers: Vec<ComputeDriverKind>,

/// Path to a Unix domain socket served by an external compute driver
/// implementing `compute_driver.proto`.
///
/// When set, the gateway uses `ComputeDriverKind::External(<path>)` and
/// skips both the `--drivers` list and the auto-detection probe. This
/// lets the gateway use out-of-tree driver binaries (Kyma, custom
/// backends) over this socket without being rebuilt.
#[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_SOCKET")]
compute_driver_socket: Option<PathBuf>,

/// Disable TLS entirely — listen on plaintext HTTP.
/// Use this when the gateway sits behind a reverse proxy or tunnel
/// (e.g. Cloudflare Tunnel) that terminates TLS at the edge.
Expand Down Expand Up @@ -350,9 +360,18 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> {
config = config.with_metrics_bind_address(addr);
}

// The --compute-driver-socket flag pins an external driver and overrides
// the --drivers list. `effective_single_driver` already mirrors this for
// pre-runtime checks; do the same here so `configured_compute_driver`
// sees the External entry when it inspects `config.compute_drivers`.
let configured_drivers = if let Some(socket) = args.compute_driver_socket.clone() {
vec![ComputeDriverKind::External(socket)]
} else {
args.drivers.clone()
};
config = config
.with_database_url(db_url)
.with_compute_drivers(args.drivers.clone())
.with_compute_drivers(configured_drivers)
.with_server_sans(args.server_sans.clone())
.with_loopback_service_http(args.enable_loopback_service_http);

Expand Down Expand Up @@ -611,9 +630,16 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches:
}

/// Resolves the single compute driver implied by the CLI arguments, if any.
///
/// `--compute-driver-socket` always yields `External(<path>)`. Otherwise an
/// empty `--drivers` list defers to `detect_driver()`, a one-element list
/// yields that element, and a longer list yields `None`.
fn effective_single_driver(args: &RunArgs) -> Option<ComputeDriverKind> {
// The --compute-driver-socket flag pins an out-of-tree driver and
// therefore wins over both the explicit --drivers list and auto-detection.
if let Some(socket) = args.compute_driver_socket.clone() {
return Some(ComputeDriverKind::External(socket));
}
match args.drivers.as_slice() {
// No explicit drivers: use whatever auto-detection reports.
[] => openshell_core::config::detect_driver(),
[driver] => Some(*driver),
// `ComputeDriverKind` isn't `Copy` (`External` holds a `PathBuf`), so
// the singleton is cloned out of the slice.
[driver] => Some(driver.clone()),
// Several drivers listed: there is no single effective driver.
_ => None,
}
}
Expand Down Expand Up @@ -1428,6 +1454,80 @@ enable_loopback_service_http = false
);
}

#[test]
fn compute_driver_socket_flag_yields_external_driver() {
    // Passing --compute-driver-socket alone must resolve to
    // ComputeDriverKind::External(<path>), which is how out-of-tree drivers
    // (Kyma, custom backends) are wired in without recompiling the gateway.
    let _lock = ENV_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let _g1 = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET");
    let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS");

    let argv = [
        "openshell-gateway",
        "--db-url",
        "sqlite::memory:",
        "--compute-driver-socket",
        "/tmp/openshell-driver.sock",
    ];
    let (args, _) = parse_with_args(&argv);

    let expected = std::path::PathBuf::from("/tmp/openshell-driver.sock");
    match super::effective_single_driver(&args) {
        Some(super::ComputeDriverKind::External(p)) => assert_eq!(p, expected),
        other => panic!("expected External, got {other:?}"),
    }
}

#[test]
fn compute_driver_socket_flag_overrides_drivers_list() {
    // --compute-driver-socket wins even when --drivers is also given, so
    // operators need not clear a gateway-wide --drivers list just to add an
    // out-of-tree driver.
    let _lock = ENV_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let _g1 = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET");
    let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS");

    let argv = [
        "openshell-gateway",
        "--db-url",
        "sqlite::memory:",
        "--drivers",
        "docker",
        "--compute-driver-socket",
        "/tmp/x.sock",
    ];
    let (args, _) = parse_with_args(&argv);

    let expected = std::path::PathBuf::from("/tmp/x.sock");
    match super::effective_single_driver(&args) {
        Some(super::ComputeDriverKind::External(p)) => assert_eq!(p, expected),
        other => panic!("expected External, got {other:?}"),
    }
}

#[test]
fn compute_driver_socket_reads_from_env_var() {
    // With no flag on the command line, the socket path must come from
    // OPENSHELL_COMPUTE_DRIVER_SOCKET.
    let _lock = ENV_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let _g1 = EnvVarGuard::set("OPENSHELL_COMPUTE_DRIVER_SOCKET", "/run/external.sock");
    let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS");

    let argv = ["openshell-gateway", "--db-url", "sqlite::memory:"];
    let (args, _) = parse_with_args(&argv);

    let expected = std::path::PathBuf::from("/run/external.sock");
    match super::effective_single_driver(&args) {
        Some(super::ComputeDriverKind::External(p)) => assert_eq!(p, expected),
        other => panic!("expected External, got {other:?}"),
    }
}

#[test]
fn driver_inherits_shared_image_from_gateway_section() {
// [openshell.gateway].default_image inherits into the K8s driver
Expand Down
70 changes: 70 additions & 0 deletions crates/openshell-server/src/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,44 @@ impl ComputeDriver for RemoteComputeDriver {
}
}

/// Build a tonic [`Channel`] connected to a Unix domain socket served by an
/// external compute driver. Used by the `External(PathBuf)` dispatch arm in
/// `lib.rs::build_compute_runtime`. The dummy authority `http://[::]:50051`
/// matches the connector convention used by the VM driver — tonic ignores it
/// once a custom service connector is supplied.
#[cfg(unix)]
pub async fn connect_external_compute_driver(
socket_path: std::path::PathBuf,
) -> Result<Channel, openshell_core::Error> {
use hyper_util::rt::TokioIo;
use tokio::net::UnixStream;
use tonic::transport::Endpoint;
use tower::service_fn;

let display_path = socket_path.clone();
Endpoint::from_static("http://[::]:50051")
.connect_with_connector(service_fn(move |_: tonic::transport::Uri| {
let socket_path = socket_path.clone();
async move { UnixStream::connect(socket_path).await.map(TokioIo::new) }
}))
.await
.map_err(|e| {
openshell_core::Error::execution(format!(
"failed to connect to external compute driver socket '{}': {e}",
display_path.display()
))
})
}

/// Non-Unix stand-in for the external driver connector: there is no Unix
/// domain socket to dial, so it always reports a configuration error.
#[cfg(not(unix))]
pub async fn connect_external_compute_driver(
    _socket_path: std::path::PathBuf,
) -> Result<Channel, openshell_core::Error> {
    let reason = "the external compute driver requires unix domain socket support";
    Err(openshell_core::Error::config(reason))
}

#[derive(Clone)]
pub struct ComputeRuntime {
driver: SharedComputeDriver,
Expand Down Expand Up @@ -373,6 +411,38 @@ impl ComputeRuntime {
.await
}

/// Wrap a tonic `Channel` to an already-running external compute driver in
/// a `ComputeRuntime`.
///
/// In contrast to [`new_remote_vm`], no child process is owned here: the
/// operator runs and supervises the external driver (systemd unit, sidecar
/// container, etc.). The `RemoteComputeDriver` proxy placed in front of the
/// channel is the same one.
pub(crate) async fn new_remote_external(
    channel: Channel,
    store: Arc<Store>,
    sandbox_index: SandboxIndex,
    sandbox_watch_bus: SandboxWatchBus,
    tracing_log_bus: TracingLogBus,
    supervisor_sessions: Arc<SupervisorSessionRegistry>,
) -> Result<Self, ComputeError> {
    let proxy = RemoteComputeDriver::new(channel);
    let driver: SharedComputeDriver = Arc::new(proxy);

    // NOTE(review): the three `None`s, `true`, and the empty `Vec` are
    // positional arguments of `from_driver`, whose signature is outside
    // this view — confirm their meaning there before changing them.
    Self::from_driver(
        driver,
        None,
        None,
        None,
        store,
        sandbox_index,
        sandbox_watch_bus,
        tracing_log_bus,
        supervisor_sessions,
        true,
        Vec::new(),
    )
    .await
}

pub async fn new_podman(
config: PodmanComputeConfig,
store: Arc<Store>,
Expand Down
Loading
Loading