diff --git a/Cargo.lock b/Cargo.lock
index f0f7e6f8c..82819789c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3188,7 +3188,6 @@ dependencies = [
  "tokio-stream",
  "tonic",
  "tracing",
- "url",
 ]
 
 [[package]]
diff --git a/architecture/README.md b/architecture/README.md
index 36b0a4978..96e5f49e2 100644
--- a/architecture/README.md
+++ b/architecture/README.md
@@ -299,6 +299,7 @@ This opens an interactive SSH session into the sandbox, with all provider creden
 | [Docs Site Architecture](docs-site.md) | Documentation source layout, navigation structure, local validation and preview workflow, and publish pipeline. |
 | [Policy Language](security-policy.md) | The YAML/Rego policy system that governs sandbox behavior. |
 | [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. |
+| [Docker Driver](docker-driver.md) | Docker compute driver implementation, host networking, loopback gateway connectivity. |
 | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. |
 | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. |
 | [TUI](tui.md) | Terminal user interface for sandbox interaction. |
diff --git a/architecture/docker-driver.md b/architecture/docker-driver.md
new file mode 100644
index 000000000..25d48f740
--- /dev/null
+++ b/architecture/docker-driver.md
@@ -0,0 +1,129 @@
+# Docker Driver
+
+The Docker compute driver manages sandbox containers through the local Docker
+daemon using the `bollard` client. It targets local developer environments
+where running a full Kubernetes cluster is unnecessary but Docker is already
+available.
+
+The gateway remains a host process. Each sandbox container bind-mounts a Linux
+`openshell-sandbox` supervisor binary and uses Docker host networking so the
+supervisor can connect to a gateway that is listening on host loopback without
+requiring an additional bridge-reachable listener on Linux.
+
+## Source Map
+
+| Path | Purpose |
+|---|---|
+| `crates/openshell-driver-docker/src/lib.rs` | Docker compute driver implementation |
+| `crates/openshell-driver-docker/src/tests.rs` | Unit tests for container spec, env, TLS paths, GPU, resource limits, and cache helpers |
+| `crates/openshell-server/src/cli.rs` | Gateway CLI flags for Docker driver configuration |
+| `crates/openshell-server/src/lib.rs` | In-process Docker compute runtime wiring |
+
+## Runtime Model
+
+```mermaid
+flowchart LR
+    CLI["OpenShell CLI<br/>host"] -->|gRPC/HTTP<br/>127.0.0.1:8080| GW["Gateway<br/>host process"]
+    GW -->|Docker API| DA["Docker daemon"]
+    DA --> C["Sandbox container<br/>network_mode=host"]
+    C --> SV["openshell-sandbox<br/>supervisor"]
+    SV -->|ConnectSupervisor<br/>OPENSHELL_ENDPOINT| GW
+    SV --> NS["Nested sandbox netns<br/>workload + policy proxy"]
+```
+
+The Docker container itself uses `network_mode = "host"`. This is intentional
+for now: it makes a gateway bound to `127.0.0.1` reachable from the supervisor
+as `127.0.0.1`, matching the host process's endpoint without a bridge listener,
+NAT rule, or userland proxy.
+
+The container also gets a Docker-managed `/etc/hosts` entry for
+`host.openshell.internal` that resolves to `127.0.0.1`. This gives callers a
+stable OpenShell-owned hostname for host services without requiring changes to
+the host machine's hosts file.
+
+The supervisor still creates a nested network namespace for the actual workload
+and routes workload traffic through its policy proxy. Agent network requests are
+enforced by the supervisor in that nested namespace.
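+
+Both properties are easy to check from the host with a generic image. A
+minimal sketch, assuming an Alpine image and a gateway already listening on
+host port 8080 (neither is required by the driver itself):
+
+```shell
+# Host networking: the container shares the host's network stack, so the
+# gateway's 127.0.0.1:8080 listener shows up inside the container.
+docker run --rm --network host alpine netstat -tln
+
+# Stable alias: --add-host reproduces the extra_hosts entry the driver
+# sets, with no edit to the host machine's /etc/hosts.
+docker run --rm --add-host host.openshell.internal:127.0.0.1 alpine \
+    cat /etc/hosts
+```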
+
+## Container Spec
+
+`build_container_create_body()` constructs the Docker container:
+
+| Field | Value | Reason |
+|---|---|---|
+| `image` | Sandbox template image | User-selected runtime image |
+| `user` | `"0"` | Supervisor needs root inside the container for namespace and mount setup |
+| `entrypoint` | `/opt/openshell/bin/openshell-sandbox` | Bind-mounted supervisor binary |
+| `cmd` | Empty vector | Prevents image CMD args from being appended to the supervisor entrypoint |
+| `network_mode` | `"host"` | Lets the supervisor connect to host loopback gateway endpoints |
+| `extra_hosts` | `host.openshell.internal:127.0.0.1` | Stable container-local alias for host loopback services |
+| `cap_add` | `SYS_ADMIN`, `NET_ADMIN`, `SYS_PTRACE`, `SYSLOG` | Required for supervisor isolation setup and process inspection |
+| `security_opt` | `apparmor=unconfined` | Docker's default AppArmor profile blocks mount operations required by network namespace setup |
+| `restart_policy` | `unless-stopped` | Resumes managed sandboxes after Docker or gateway restarts |
+| `device_requests` | CDI all-GPU request when `spec.gpu` is true | Enables Docker CDI GPU sandboxes when daemon support is detected |
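+
+The resulting spec can be confirmed on a live sandbox with `docker inspect`;
+the container name below is a placeholder:
+
+```shell
+# Networking-related fields from the table above, read back from Docker.
+docker inspect --format \
+    'net={{.HostConfig.NetworkMode}} hosts={{.HostConfig.ExtraHosts}} caps={{.HostConfig.CapAdd}}' \
+    <sandbox-container>
+```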
+
+## Gateway Callback
+
+The Docker driver injects `OPENSHELL_ENDPOINT` into each sandbox container from
+`Config::grpc_endpoint` without rewriting it. This is the key difference from a
+bridge-network design.
+
+Examples:
+
+```shell
+OPENSHELL_GRPC_ENDPOINT=http://127.0.0.1:8080
+```
+
+and:
+
+```shell
+OPENSHELL_GRPC_ENDPOINT=https://127.0.0.1:8080
+```
+
+are passed into the supervisor as-is. Because the container shares the host
+network namespace, `127.0.0.1` resolves to the host loopback interface and the
+gateway is reachable when it binds loopback.
+
+The endpoint can also use the stable alias:
+
+```shell
+OPENSHELL_GRPC_ENDPOINT=http://host.openshell.internal:8080
+```
+
+In host network mode this name resolves to `127.0.0.1` inside the container.
+
+For TLS endpoints, the gateway certificate must include the exact endpoint host
+as a subject alternative name. For `https://127.0.0.1:8080`, the certificate
+needs an IP SAN for `127.0.0.1`. For `https://localhost:8080`, it needs a DNS
+SAN for `localhost`. For `https://host.openshell.internal:8080`, it needs a DNS
+SAN for `host.openshell.internal`.
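+
+A development certificate can carry all three SAN forms at once. A minimal
+sketch, assuming OpenSSL 1.1.1 or newer for `-addext` (file names and subject
+are illustrative):
+
+```shell
+# Self-signed gateway certificate with IP and DNS SANs covering every
+# endpoint host form above.
+openssl req -x509 -newkey rsa:2048 -nodes -days 365 \
+    -keyout gateway.key -out gateway.crt \
+    -subj "/CN=openshell-gateway" \
+    -addext "subjectAltName=IP:127.0.0.1,DNS:localhost,DNS:host.openshell.internal"
+```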
+
+Docker sandboxes also require client TLS material:
+
+| Env / flag | Purpose |
+|---|---|
+| `OPENSHELL_DOCKER_TLS_CA` / `--docker-tls-ca` | CA certificate mounted at `/etc/openshell/tls/client/ca.crt` |
+| `OPENSHELL_DOCKER_TLS_CERT` / `--docker-tls-cert` | Client certificate mounted at `/etc/openshell/tls/client/tls.crt` |
+| `OPENSHELL_DOCKER_TLS_KEY` / `--docker-tls-key` | Client private key mounted at `/etc/openshell/tls/client/tls.key` |
+
+When `OPENSHELL_GRPC_ENDPOINT` uses `http://`, these TLS mounts are not
+required and providing them is rejected. When it uses `https://`, all three are
+required.
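+
+As a sketch of gateway-side wiring for an HTTPS callback endpoint, using the
+environment variables from the table above (the paths are illustrative, and
+exporting them before starting the gateway is only one way to supply them):
+
+```shell
+# All three client TLS inputs are required once the endpoint is https://.
+export OPENSHELL_GRPC_ENDPOINT=https://127.0.0.1:8080
+export OPENSHELL_DOCKER_TLS_CA=/etc/openshell/tls/ca.crt
+export OPENSHELL_DOCKER_TLS_CERT=/etc/openshell/tls/client.crt
+export OPENSHELL_DOCKER_TLS_KEY=/etc/openshell/tls/client.key
+```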
+
+## Environment
+
+`build_environment()` merges template environment, spec environment, and
+driver-controlled keys. Driver-controlled keys win:
+
+| Variable | Value |
+|---|---|
+| `OPENSHELL_ENDPOINT` | Exact configured gateway endpoint |
+| `OPENSHELL_SANDBOX_ID` | Sandbox ID |
+| `OPENSHELL_SANDBOX` | Sandbox name |
+| `OPENSHELL_SSH_SOCKET_PATH` | Unix socket path used by the supervisor's embedded SSH daemon |
+| `OPENSHELL_SANDBOX_COMMAND` | `sleep infinity` |
+| `OPENSHELL_TLS_CA` | Mounted CA path when HTTPS is enabled |
+| `OPENSHELL_TLS_CERT` | Mounted client cert path when HTTPS is enabled |
+| `OPENSHELL_TLS_KEY` | Mounted client key path when HTTPS is enabled |
+
+The Docker driver does not inject `OPENSHELL_SSH_HANDSHAKE_SECRET`; the
+supervisor-to-gateway path relies on mTLS for the Docker callback.
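+
+The merged result can be checked by listing the environment of a running
+sandbox; the container name is a placeholder:
+
+```shell
+# Driver-controlled keys appear with their final values, overriding any
+# template- or spec-provided entries with the same name.
+docker exec <sandbox-container> env | grep '^OPENSHELL_'
+```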
diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs
index bfcb6cd09..04187934e 100644
--- a/crates/openshell-core/src/config.rs
+++ b/crates/openshell-core/src/config.rs
@@ -105,6 +105,15 @@ pub struct Config {
     #[serde(default)]
     pub metrics_bind_address: Option<SocketAddr>,
 
+    /// Additional bind addresses that serve the same multiplexed gRPC/HTTP
+    /// surface as `bind_address`.
+    ///
+    /// Compute drivers may register extra listeners during startup so that
+    /// sandbox workloads can call back into the gateway over an interface
+    /// that the operator-supplied `bind_address` does not expose.
+    #[serde(default)]
+    pub extra_bind_addresses: Vec<SocketAddr>,
+
     /// Log level (trace, debug, info, warn, error).
     #[serde(default = "default_log_level")]
     pub log_level: String,
@@ -292,6 +301,7 @@ impl Config {
             bind_address: default_bind_address(),
             health_bind_address: None,
             metrics_bind_address: None,
+            extra_bind_addresses: Vec::new(),
             log_level: default_log_level(),
             tls,
             oidc: None,
@@ -333,6 +343,19 @@ impl Config {
         self
     }
 
+    /// Append an extra listener address to the multiplex service.
+    ///
+    /// Duplicate entries (matching `bind_address` or any existing entry) are
+    /// silently dropped so callers can naively push driver-derived addresses
+    /// without checking for collisions.
+    #[must_use]
+    pub fn with_extra_bind_address(mut self, addr: SocketAddr) -> Self {
+        if addr != self.bind_address && !self.extra_bind_addresses.contains(&addr) {
+            self.extra_bind_addresses.push(addr);
+        }
+        self
+    }
+
     /// Create a new configuration with the given log level.
     #[must_use]
     pub fn with_log_level(mut self, level: impl Into<String>) -> Self {
diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml
index 8e2fd1777..ee917b78d 100644
--- a/crates/openshell-driver-docker/Cargo.toml
+++ b/crates/openshell-driver-docker/Cargo.toml
@@ -19,7 +19,6 @@ futures = { workspace = true }
 tokio-stream = { workspace = true }
 tracing = { workspace = true }
 bytes = { workspace = true }
-url = { workspace = true }
 bollard = { version = "0.20" }
 tar = "0.4"
 tempfile = "3"
diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs
index 870a51ca6..8b8df5b89 100644
--- a/crates/openshell-driver-docker/src/lib.rs
+++ b/crates/openshell-driver-docker/src/lib.rs
@@ -38,7 +38,6 @@ use tokio::sync::{broadcast, mpsc};
 use tokio_stream::wrappers::ReceiverStream;
 use tonic::{Request, Response, Status};
 use tracing::{info, warn};
-use url::{Host, Url};
 
 const WATCH_BUFFER: usize = 128;
 const WATCH_POLL_INTERVAL: Duration = Duration::from_secs(2);
@@ -55,8 +54,7 @@ const TLS_CA_MOUNT_PATH: &str = "/etc/openshell/tls/client/ca.crt";
 const TLS_CERT_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.crt";
 const TLS_KEY_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.key";
 const SANDBOX_COMMAND: &str = "sleep infinity";
-const HOST_OPENSHELL_INTERNAL: &str = "host.openshell.internal";
-const HOST_DOCKER_INTERNAL: &str = "host.docker.internal";
+const HOST_OPENSHELL_INTERNAL_HOSTS_ENTRY: &str = "host.openshell.internal:127.0.0.1";
 
 /// Default image holding the Linux `openshell-sandbox` binary. The gateway
 /// pulls this image and extracts the binary to a host-side cache when no
@@ -852,7 +850,7 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
     environment.insert(
         "OPENSHELL_ENDPOINT".to_string(),
-        container_visible_openshell_endpoint(&config.grpc_endpoint),
+        config.grpc_endpoint.clone(),
     );
     environment.insert("OPENSHELL_SANDBOX_ID".to_string(), sandbox.id.clone());
     environment.insert("OPENSHELL_SANDBOX".to_string(), sandbox.name.clone());
@@ -950,17 +948,27 @@
             "SYS_PTRACE".to_string(),
             "SYSLOG".to_string(),
         ]),
-        // AppArmor's default Docker profile blocks mount(2) with MS_SHARED
-        // even when SYS_ADMIN is granted, which prevents ip-netns from
-        // creating network namespaces for proxy-mode isolation. The sandbox
-        // enforces its own isolation via seccomp, Landlock, and network
-        // namespaces, so the host AppArmor profile adds no meaningful
-        // defence here.
+        // The sandbox supervisor needs to bind-mount `/run/netns`,
+        // mark it shared, and create per-process network namespaces.
+        // Docker's default AppArmor profile (`docker-default`) denies
+        // these mount operations even with CAP_SYS_ADMIN, so we opt
+        // out of AppArmor confinement for sandbox containers. The
+        // sandbox enforces its own security boundary via Landlock,
+        // seccomp, OPA policy evaluation, and the dedicated network
+        // namespace it sets up for the agent; AppArmor at the
+        // container layer is redundant relative to those controls
+        // and conflicts with them in this case.
         security_opt: Some(vec!["apparmor=unconfined".to_string()]),
-        extra_hosts: Some(vec![
-            format!("{HOST_DOCKER_INTERNAL}:host-gateway"),
-            format!("{HOST_OPENSHELL_INTERNAL}:host-gateway"),
-        ]),
+        // Run in the host network namespace so a gateway bound to
+        // 127.0.0.1 is reachable from the supervisor as 127.0.0.1.
+        // The supervisor still creates a nested network namespace for
+        // the sandboxed workload and forces workload traffic through
+        // its policy proxy.
+        network_mode: Some("host".to_string()),
+        // Keep a stable host alias available inside the container without
+        // requiring users to edit the host's /etc/hosts. In host network
+        // mode this resolves back to the host loopback gateway.
+        extra_hosts: Some(vec![HOST_OPENSHELL_INTERNAL_HOSTS_ENTRY.to_string()]),
         ..Default::default()
     }),
     ..Default::default()
 })
@@ -991,25 +999,6 @@ fn sandbox_log_level(sandbox: &DriverSandbox, default_level: &str) -> String {
         .to_string()
 }
 
-fn container_visible_openshell_endpoint(endpoint: &str) -> String {
-    let Ok(mut url) = Url::parse(endpoint) else {
-        return endpoint.to_string();
-    };
-
-    let should_rewrite = match url.host() {
-        Some(Host::Ipv4(ip)) => ip.is_loopback() || ip.is_unspecified(),
-        Some(Host::Ipv6(ip)) => ip.is_loopback() || ip.is_unspecified(),
-        Some(Host::Domain(host)) => host.eq_ignore_ascii_case("localhost"),
-        None => false,
-    };
-
-    if should_rewrite && url.set_host(Some(HOST_OPENSHELL_INTERNAL)).is_ok() {
-        return url.to_string();
-    }
-
-    endpoint.to_string()
-}
-
 fn docker_resource_limits(
     template: &DriverSandboxTemplate,
 ) -> Result {
diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs
index 16dc3f733..b016b31eb 100644
--- a/crates/openshell-driver-docker/src/tests.rs
+++ b/crates/openshell-driver-docker/src/tests.rs
@@ -57,19 +57,16 @@ fn runtime_config() -> DockerDriverRuntimeConfig {
 }
 
 #[test]
-fn container_visible_endpoint_rewrites_loopback_hosts() {
-    assert_eq!(
-        container_visible_openshell_endpoint("https://localhost:8443"),
-        "https://host.openshell.internal:8443/"
-    );
-    assert_eq!(
-        container_visible_openshell_endpoint("http://127.0.0.1:8080"),
-        "http://host.openshell.internal:8080/"
-    );
-    assert_eq!(
-        container_visible_openshell_endpoint("https://gateway.internal:8443"),
-        "https://gateway.internal:8443"
-    );
+fn build_environment_preserves_loopback_endpoint_for_host_network() {
+    let mut config = runtime_config();
+    config.grpc_endpoint = "http://127.0.0.1:8080".to_string();
+
+    let env = build_environment(&test_sandbox(), &config);
+    assert!(env.contains(&"OPENSHELL_ENDPOINT=http://127.0.0.1:8080".to_string()));
+
+    config.grpc_endpoint = "https://localhost:8443".to_string();
+    let env = build_environment(&test_sandbox(), &config);
+    assert!(env.contains(&"OPENSHELL_ENDPOINT=https://localhost:8443".to_string()));
 }
 
 #[test]
@@ -231,6 +228,23 @@ fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() {
     require_sandbox_identifier("sbx-1", "demo").expect("id and name is accepted");
 }
 
+#[test]
+fn build_container_create_body_uses_host_network() {
+    let create_body = build_container_create_body(&test_sandbox(), &runtime_config()).unwrap();
+    let host_config = create_body.host_config.expect("host_config is populated");
+
+    assert_eq!(
+        host_config.network_mode,
+        Some("host".to_string()),
+        "sandbox must use host networking so 127.0.0.1 reaches the host gateway"
+    );
+    assert_eq!(
+        host_config.extra_hosts,
+        Some(vec!["host.openshell.internal:127.0.0.1".to_string()]),
+        "sandbox should expose a stable host alias without host /etc/hosts edits"
+    );
+}
+
 #[test]
 fn build_container_create_body_uses_runtime_namespace_label() {
     // Regression test: the namespace label must come from the driver's
diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs
index 562d926b8..d26850ba7 100644
--- a/crates/openshell-server/src/lib.rs
+++ b/crates/openshell-server/src/lib.rs
@@ -40,9 +40,11 @@ use metrics_exporter_prometheus::PrometheusBuilder;
 use openshell_core::{ComputeDriverKind, Config, Error, Result};
 use std::collections::HashMap;
 use std::io::ErrorKind;
+use std::net::SocketAddr;
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
 use tokio::net::TcpListener;
+use tokio::sync::broadcast;
 use tracing::{debug, error, info, warn};
 
 use compute::{ComputeRuntime, DockerComputeConfig, VmComputeConfig};
@@ -221,12 +223,24 @@
     // Create the multiplexed service
     let service = MultiplexService::new(state.clone());
 
-    // Bind the TCP listener
-    let listener = TcpListener::bind(config.bind_address)
+    // Bind the primary TCP listener plus any extras requested by drivers.
+    // The same multiplex service is served on each address so the CLI on
+    // loopback and sandboxes on a driver-supplied interface can both reach
+    // the gateway with identical semantics.
+    let mut listeners: Vec<(SocketAddr, TcpListener)> = Vec::new();
+    let primary_listener = TcpListener::bind(config.bind_address)
         .await
         .map_err(|e| Error::transport(format!("failed to bind to {}: {e}", config.bind_address)))?;
-    info!(address = %config.bind_address, "Server listening");
+    listeners.push((config.bind_address, primary_listener));
+
+    for extra in &config.extra_bind_addresses {
+        let extra_listener = TcpListener::bind(*extra)
+            .await
+            .map_err(|e| Error::transport(format!("failed to bind extra address {extra}: {e}")))?;
+        info!(address = %extra, "Server listening on extra address");
+        listeners.push((*extra, extra_listener));
+    }
 
     // Bind the unauthenticated health endpoint on a separate port when configured.
     if let Some(health_bind_address) = config.health_bind_address {
@@ -284,21 +298,59 @@
         None
     };
 
-    let shutdown = shutdown_signal();
-    tokio::pin!(shutdown);
+    // Coordinate graceful shutdown across every listener: a single broadcast
+    // channel notifies all accept loops, and a `JoinSet` lets us wait for
+    // them to drain before returning.
+    let (shutdown_tx, _) = broadcast::channel::<()>(1);
+    let mut accept_tasks = tokio::task::JoinSet::new();
+    for (addr, listener) in listeners {
+        let service = service.clone();
+        let tls_acceptor = tls_acceptor.clone();
+        let mut shutdown_rx = shutdown_tx.subscribe();
+        accept_tasks.spawn(async move {
+            run_accept_loop(addr, listener, service, tls_acceptor, &mut shutdown_rx).await;
+        });
+    }
+
+    shutdown_signal().await;
+    info!("Shutdown signal received; stopping gateway");
+    let _ = shutdown_tx.send(());
+    while accept_tasks.join_next().await.is_some() {}
+
+    state
+        .compute
+        .cleanup_on_shutdown()
+        .await
+        .map_err(|err| Error::execution(format!("gateway shutdown cleanup failed: {err}")))?;
+
+    Ok(())
+}
 
-    // Accept connections until the gateway receives a graceful shutdown signal.
+/// Drive a single listener until either the listener errors fatally or the
+/// gateway receives a shutdown signal.
+///
+/// All listeners share the same `MultiplexService` and (optional) TLS
+/// acceptor, so callers can run multiple instances of this loop in parallel
+/// to expose the gateway on more than one bind address without forking the
+/// service definition.
+async fn run_accept_loop(
+    bind_addr: SocketAddr,
+    listener: TcpListener,
+    service: MultiplexService,
+    tls_acceptor: Option<TlsAcceptor>,
+    shutdown_rx: &mut broadcast::Receiver<()>,
+) {
     loop {
         let (stream, addr) = tokio::select! {
-            () = &mut shutdown => {
-                info!("Shutdown signal received; stopping gateway");
-                break;
+            _ = shutdown_rx.recv() => {
+                debug!(bind = %bind_addr, "Listener received shutdown");
+                return;
             }
             accepted = listener.accept() => {
                 match accepted {
                     Ok(conn) => conn,
                     Err(e) => {
-                        error!(error = %e, "Failed to accept connection");
+                        error!(error = %e, bind = %bind_addr, "Failed to accept connection");
                         continue;
                     }
                 }
             }
@@ -333,14 +385,6 @@
             });
         }
     }
-
-    state
-        .compute
-        .cleanup_on_shutdown()
-        .await
-        .map_err(|err| Error::execution(format!("gateway shutdown cleanup failed: {err}")))?;
-
-    Ok(())
 }
 
 async fn shutdown_signal() {