diff --git a/architecture/sandbox-custom-containers.md b/architecture/sandbox-custom-containers.md index 7718ac934..c668f611e 100644 --- a/architecture/sandbox-custom-containers.md +++ b/architecture/sandbox-custom-containers.md @@ -36,6 +36,8 @@ When `--from` points to a Dockerfile or directory, the CLI: 1. Builds the image locally via the Docker daemon (respecting `.dockerignore`). 2. Creates the sandbox with the resulting local image tag. +The build step aborts with a clear error if the Docker build stream stays silent for longer than `OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS` seconds (default 1800). This is a guard against deadlocked container runtimes — most commonly an under-provisioned VM (e.g. macOS Colima with the default 2 vCPU / 2 GiB) where BuildKit can stop emitting events partway through a multi-step build and never recover. Raise the value if a legitimate build step is just quiet, or lower it for tighter CI budgets. + ## How It Works The supervisor binary (`openshell-sandbox`) is **always side-loaded** from the k3s node filesystem via a read-only `hostPath` volume. It is never baked into sandbox images. This applies to all sandbox pods — whether using the default community base image, a custom image, or a user-built Dockerfile. diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index a313d4394..9caeca57a 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -11,15 +11,28 @@ use std::collections::HashMap; use std::path::Path; +use std::time::Duration; use bollard::Docker; use bollard::query_parameters::BuildImageOptionsBuilder; use futures::StreamExt; use miette::{IntoDiagnostic, Result, WrapErr}; +use tokio::time::timeout; use crate::constants::container_name; use crate::push::push_local_images; +/// Maximum gap between Docker build stream events before a build is treated +/// as stuck. 
+/// +/// Total silence longer than this on under-provisioned container runtimes +/// (e.g. default Colima 2 vCPU / 2 GiB on macOS) reliably indicates a +/// deadlocked builder that will never recover. The default leaves headroom +/// for legitimately quiet steps (a single long `RUN` that produces no output) +/// — override with `OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS` if a specific +/// build needs more time, or set a lower value to tighten CI budgets. +const DEFAULT_BUILD_NO_PROGRESS_TIMEOUT_SECS: u64 = 1800; + /// Build a container image from a Dockerfile using the local Docker daemon. /// /// This is used by `openshell sandbox create --from ` for both the @@ -126,9 +139,30 @@ async fn build_image( let body = bollard::body_full(bytes::Bytes::from(context_tar)); let mut stream = docker.build_image(options, None, Some(body)); + let no_progress_secs: u64 = std::env::var("OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .filter(|&n| n > 0) + .unwrap_or(DEFAULT_BUILD_NO_PROGRESS_TIMEOUT_SECS); + let no_progress_timeout = Duration::from_secs(no_progress_secs); + + loop { + let next = match timeout(no_progress_timeout, stream.next()).await { + Ok(Some(result)) => result, + Ok(None) => break, + Err(_) => { + return Err(miette::miette!( + "Docker build produced no output for {}s. This usually means the container \ + runtime is under-provisioned (CPU/memory) and the builder has deadlocked; \ + check `docker info` (NCPU, MemTotal) and increase Colima/Docker Desktop \ + resources before retrying. If a legitimate build step is just quiet, raise \ + the threshold with OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS=.", + no_progress_timeout.as_secs() + )); + } + }; - while let Some(result) = stream.next().await { - let info = result + let info = next .into_diagnostic() .wrap_err("Docker build stream error")?;