From 4f3708796d53fa4eb68d52862ee216c76dc7d276 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Fri, 1 May 2026 13:10:56 +0000 Subject: [PATCH 1/3] fix(bootstrap): add no-progress timeout to image build bollard's build_image() stream relays Docker build progress, but stream.next() blocks indefinitely when the underlying builder deadlocks (commonly seen on macOS Colima with the default 2 vCPU / 2 GiB allocation). The build hangs with no error, no timeout, and openshell sandbox create waits forever for output that never arrives. Wrap each stream.next() in tokio::time::timeout (default 30 min, override via OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS). On expiry, abort with a diagnostic that points at the likely root cause (under-provisioned runtime) and the docker info NCPU/MemTotal fields the user should check, plus the env var to relax the threshold for legitimate quiet builds. Signed-off-by: Tinson Lai --- crates/openshell-bootstrap/src/build.rs | 37 +++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index fb9b4a63d..d1987f507 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -9,15 +9,28 @@ use std::collections::HashMap; use std::path::Path; +use std::time::Duration; use bollard::Docker; use bollard::query_parameters::BuildImageOptionsBuilder; use futures::StreamExt; use miette::{IntoDiagnostic, Result, WrapErr}; +use tokio::time::timeout; use crate::constants::container_name; use crate::push::push_local_images; +/// Maximum gap between Docker build stream events before a build is treated +/// as stuck. +/// +/// Total silence longer than this on under-provisioned container runtimes +/// (e.g. default Colima 2 vCPU / 2 GiB on macOS) reliably indicates a +/// deadlocked builder that will never recover. 
The default leaves headroom +/// for legitimately quiet steps (a single long `RUN` that produces no output); +/// set `OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS` to a larger value if a specific +/// build needs more time, or to a smaller one to tighten CI budgets. +const DEFAULT_BUILD_NO_PROGRESS_TIMEOUT_SECS: u64 = 1800; + /// Build a container image from a Dockerfile and push it into the gateway. /// /// This is used by `openshell sandbox create --from `. It: @@ -100,9 +113,29 @@ async fn build_image( let body = bollard::body_full(bytes::Bytes::from(context_tar)); let mut stream = docker.build_image(options, None, Some(body)); + let no_progress_secs: u64 = std::env::var("OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(DEFAULT_BUILD_NO_PROGRESS_TIMEOUT_SECS); + let no_progress_timeout = Duration::from_secs(no_progress_secs); + + loop { + let next = match timeout(no_progress_timeout, stream.next()).await { + Ok(Some(result)) => result, + Ok(None) => break, + Err(_) => { + return Err(miette::miette!( + "Docker build produced no output for {}s. This usually means the container \ + runtime is under-provisioned (CPU/memory) and the builder has deadlocked; \ + check `docker info` (NCPU, MemTotal) and increase Colima/Docker Desktop \ + resources before retrying. 
If a legitimate build step is just quiet, raise \ + the threshold with OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS=.", + no_progress_timeout.as_secs() + )); + } + }; - while let Some(result) = stream.next().await { - let info = result + let info = next .into_diagnostic() .wrap_err("Docker build stream error")?; From 72cabb49e413283e22c74e6ce7ad33c60d7570d3 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Fri, 1 May 2026 13:38:38 +0000 Subject: [PATCH 2/3] docs(architecture): note OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS in Dockerfile build flow Signed-off-by: Tinson Lai --- architecture/sandbox-custom-containers.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/architecture/sandbox-custom-containers.md b/architecture/sandbox-custom-containers.md index 5d482ffe0..37431fd25 100644 --- a/architecture/sandbox-custom-containers.md +++ b/architecture/sandbox-custom-containers.md @@ -36,6 +36,8 @@ When `--from` points to a Dockerfile or directory, the CLI: 2. Pushes it into the cluster's containerd runtime using `docker save` / `ctr import`. 3. Creates the sandbox with the resulting image tag. +The build step aborts with a clear error if the Docker build stream stays silent for longer than `OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS` seconds (default 1800). This is a guard against deadlocked container runtimes — most commonly an under-provisioned VM (e.g. macOS Colima with the default 2 vCPU / 2 GiB) where BuildKit can stop emitting events partway through a multi-step build and never recover. Raise the value if a legitimate build step is just quiet, or lower it for tighter CI budgets. A value that is not a positive integer (including `0`) is ignored and the default applies. + ## How It Works The supervisor binary (`openshell-sandbox`) is **always side-loaded** from the k3s node filesystem via a read-only `hostPath` volume. It is never baked into sandbox images. This applies to all sandbox pods — whether using the default community base image, a custom image, or a user-built Dockerfile. 
From a19bda8bbd0fc134b2206d87a3e4ddd383d20ec6 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Fri, 1 May 2026 13:40:23 +0000 Subject: [PATCH 3/3] fix(bootstrap): treat zero-valued no-progress override as unset OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS=0 previously parsed cleanly into Duration::from_secs(0), making every build fail immediately with "produced no output for 0s". Filter zero alongside non-numeric values so it falls back to the default. Signed-off-by: Tinson Lai --- crates/openshell-bootstrap/src/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index d1987f507..27c05e07d 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -116,6 +116,7 @@ async fn build_image( let no_progress_secs: u64 = std::env::var("OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS") .ok() .and_then(|s| s.parse().ok()) + .filter(|&n| n > 0) .unwrap_or(DEFAULT_BUILD_NO_PROGRESS_TIMEOUT_SECS); let no_progress_timeout = Duration::from_secs(no_progress_secs);