From ca8289b87af95ac339ac3d0fe20bbe4b200bd57f Mon Sep 17 00:00:00 2001 From: "posthog[bot]" <206114724+posthog[bot]@users.noreply.github.com> Date: Mon, 15 Jun 2026 12:46:43 +0000 Subject: [PATCH] fix(workspace-server): diagnose and self-heal startup failures spawnChild() previously swallowed every cause of a failed boot and reported a blanket "failed to become healthy within 5000ms" with no child stderr, and polled the full 5s even after the child had already died. - Capture the child's exit/error events and a bounded stderr tail, and include them in the thrown error so env-validation exits (process.exit(2)), EADDRINUSE, and bundle/spawn failures are diagnosable. - Fail fast: race readiness against child death and abort polling the instant the child exits instead of waiting out the timeout against a dead port. - Bounded respawn (3 attempts) on early exit, picking a fresh port each time, which self-heals the findFreePort() TOCTOU port-steal race. - Widen HEALTH_POLL_TIMEOUT_MS to 15s for slow cold starts; fail-fast means a healthy server still returns the moment it is reachable. Generated-By: PostHog Code Task-Id: 717fc35c-098e-4bb2-9f43-0d650a4dd635 --- .../services/workspace-server/service.test.ts | 132 ++++++++++++++++++ .../main/services/workspace-server/service.ts | 112 +++++++++++++-- 2 files changed, 235 insertions(+), 9 deletions(-) create mode 100644 apps/code/src/main/services/workspace-server/service.test.ts diff --git a/apps/code/src/main/services/workspace-server/service.test.ts b/apps/code/src/main/services/workspace-server/service.test.ts new file mode 100644 index 000000000..9ca897de0 --- /dev/null +++ b/apps/code/src/main/services/workspace-server/service.test.ts @@ -0,0 +1,132 @@ +import { EventEmitter } from "node:events"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +vi.mock("../../utils/logger.js", () => ({ + logger: { + scope: () => ({ + info: vi.fn(), + error: vi.fn(), + warn: vi.fn(), + debug: vi.fn(), + }), + }, +})); + +const spawnMock = vi.hoisted(() => vi.fn()); +vi.mock("node:child_process", () => ({ + spawn: spawnMock, + default: { spawn: spawnMock }, +})); + +let nextPort = 5000; +const createServerMock = vi.hoisted(() => vi.fn()); +vi.mock("node:net", () => ({ + createServer: createServerMock, + default: { createServer: createServerMock }, +})); + +import { WorkspaceServerService } from "./service"; + +interface FakeChild extends EventEmitter { + stdout: EventEmitter; + stderr: EventEmitter; + kill: ReturnType; + pid: number; +} + +function makeFakeChild(): FakeChild { + const child = new EventEmitter() as FakeChild; + child.stdout = new EventEmitter(); + child.stderr = new EventEmitter(); + child.kill = vi.fn(); + child.pid = 4242; + return child; +} + +const fetchMock = vi.fn(); + +beforeEach(() => { + vi.clearAllMocks(); + nextPort = 5000; + createServerMock.mockImplementation(() => { + const port = ++nextPort; + return { + unref: () => {}, + on: () => {}, + listen: (_p: number, _host: string, cb: () => void) => cb(), + address: () => ({ port }), + close: (cb?: () => void) => cb?.(), + }; + }); + vi.stubGlobal("fetch", fetchMock); +}); + +afterEach(() => { + vi.unstubAllGlobals(); +}); + +describe("WorkspaceServerService", () => { + it("resolves the connection once the child reports healthy", async () => { + const child = makeFakeChild(); + spawnMock.mockReturnValue(child); + fetchMock.mockResolvedValue({ ok: true }); + + const service = new WorkspaceServerService(); + const conn = await service.start(); + + expect(spawnMock).toHaveBeenCalledTimes(1); + expect(conn.url).toMatch(/^http:\/\/127\.0\.0\.1:\d+$/); + expect(conn.secret).toHaveLength(64); + }); + + it("surfaces the child's exit code and stderr instead of an opaque timeout", async () => { + spawnMock.mockImplementation(() => { + const child = makeFakeChild(); + setTimeout(() => { + child.stderr.emit( + "data", + Buffer.from( + "[workspace-server] missing or invalid WORKSPACE_SERVER_SECRET / WORKSPACE_SERVER_PORT\n", + ), + ); + child.emit("exit", 2, null); + }, 0); + return child; + }); + // Never healthy: the failure must come from the child exit, not the timeout. + fetchMock.mockRejectedValue(new Error("ECONNREFUSED")); + + const service = new WorkspaceServerService(); + await expect(service.start()).rejects.toThrow( + /exited before becoming healthy.*exit code 2.*WORKSPACE_SERVER_SECRET/s, + ); + // Early exit is treated as retriable (e.g. a stolen port), so we exhaust attempts. + expect(spawnMock).toHaveBeenCalledTimes(3); + }); + + it("recovers by respawning on a fresh port after an early exit", async () => { + let serverAlive = false; + spawnMock.mockImplementation(() => { + const child = makeFakeChild(); + if (spawnMock.mock.calls.length === 1) { + setTimeout(() => { + child.stderr.emit("data", Buffer.from("Error: listen EADDRINUSE\n")); + child.emit("exit", 1, null); + }, 0); + } else { + serverAlive = true; + } + return child; + }); + fetchMock.mockImplementation(async () => { + if (!serverAlive) throw new Error("ECONNREFUSED"); + return { ok: true }; + }); + + const service = new WorkspaceServerService(); + const conn = await service.start(); + + expect(spawnMock).toHaveBeenCalledTimes(2); + expect(conn.secret).toHaveLength(64); + }); +}); diff --git a/apps/code/src/main/services/workspace-server/service.ts b/apps/code/src/main/services/workspace-server/service.ts index ecbd7e402..f8f41ade4 100644 --- a/apps/code/src/main/services/workspace-server/service.ts +++ b/apps/code/src/main/services/workspace-server/service.ts @@ -8,8 +8,17 @@ import { injectable } from "inversify"; import { logger } from "../../utils/logger.js"; const HEALTH_POLL_INTERVAL_MS = 100; -const HEALTH_POLL_TIMEOUT_MS = 5_000; +// Cold starts (slow disks, AV scanning the freshly-spawned node binary) can take +// several seconds to bind. fail-fast on early child exit means a healthy server +// still returns the moment it is reachable, so a generous ceiling only helps the +// genuinely-slow case rather than penalising the common one. +const HEALTH_POLL_TIMEOUT_MS = 15_000; const SHUTDOWN_GRACE_MS = 3_000; +// Bounded respawn: the dominant transient failure is a port stolen between +// findFreePort() releasing it and the child re-binding, which surfaces as an +// early exit. Retrying picks a fresh port and self-heals that race. +const MAX_SPAWN_ATTEMPTS = 3; +const STDERR_CAPTURE_LIMIT = 8_192; const log = logger.scope("workspace-server"); @@ -24,6 +33,21 @@ export interface WorkspaceServerEvents { }; } +interface ChildExit { + code: number | null; + signal: NodeJS.Signals | null; + error?: Error; +} + +class WorkspaceServerStartError extends Error { + readonly retriable: boolean; + constructor(message: string, retriable: boolean) { + super(message); + this.name = "WorkspaceServerStartError"; + this.retriable = retriable; + } +} + @injectable() export class WorkspaceServerService extends TypedEventEmitter { private readonly scriptPath = path.join(__dirname, "workspace-server.js"); @@ -39,7 +63,7 @@ export class WorkspaceServerService extends TypedEventEmitter { + this.pendingStart = this.spawnWithRetries().finally(() => { this.pendingStart = null; }); return this.pendingStart; @@ -60,6 +84,29 @@ export class WorkspaceServerService extends TypedEventEmitter { + let lastError: Error | undefined; + for (let attempt = 1; attempt <= MAX_SPAWN_ATTEMPTS; attempt++) { + try { + return await this.spawnChild(); + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + const retriable = + err instanceof WorkspaceServerStartError && + err.retriable && + attempt < MAX_SPAWN_ATTEMPTS; + log.warn("workspace-server start attempt failed", { + attempt, + maxAttempts: MAX_SPAWN_ATTEMPTS, + willRetry: retriable, + error: lastError.message, + }); + if (!retriable) break; + } + } + throw lastError ?? new Error("workspace-server failed to start"); + } + private async spawnChild(): Promise { const port = await findFreePort(); const secret = randomBytes(32).toString("hex"); @@ -77,10 +124,24 @@ export class WorkspaceServerService extends TypedEventEmitter process.exit(2), EADDRINUSE, bundle errors) reports + // the real cause instead of an opaque timeout. + let stderrTail = ""; + const exited = new Promise((resolve) => { + c.once("error", (error) => resolve({ code: null, signal: null, error })); + c.once("exit", (code, signal) => resolve({ code, signal })); + }); c.stdout?.on("data", (chunk) => process.stdout.write(chunk)); - c.stderr?.on("data", (chunk) => process.stderr.write(chunk)); + c.stderr?.on("data", (chunk) => { + process.stderr.write(chunk); + stderrTail = (stderrTail + String(chunk)).slice(-STDERR_CAPTURE_LIMIT); + }); c.once("exit", (code, signal) => { + if (this.child !== c) return; const wasConnected = this.connection !== null; this.child = null; this.connection = null; @@ -90,12 +151,34 @@ export class WorkspaceServerService extends TypedEventEmitter ({ kind: "health", ok }) as const, + ), + exited.then((exit) => { + abort.abort(); + return { kind: "exit", exit } as const; + }), + ]); + + if (outcome.kind === "exit") { + throw new WorkspaceServerStartError( + describeEarlyExit(outcome.exit, stderrTail), + true, + ); + } - if (!(await pollHealth(url))) { + if (!outcome.ok) { this.stop(); - throw new Error( - `workspace-server failed to become healthy within ${HEALTH_POLL_TIMEOUT_MS}ms`, + const tail = stderrTail.trim(); + throw new WorkspaceServerStartError( + `workspace-server failed to become healthy within ${HEALTH_POLL_TIMEOUT_MS}ms${ + tail ? `; last stderr: ${tail}` : "" + }`, + false, ); } @@ -104,6 +187,16 @@ export class WorkspaceServerService extends TypedEventEmitter { return new Promise((resolve, reject) => { const s = createServer(); @@ -122,11 +215,12 @@ async function findFreePort(): Promise { }); } -async function pollHealth(url: string): Promise { +async function pollHealth(url: string, signal: AbortSignal): Promise { const deadline = Date.now() + HEALTH_POLL_TIMEOUT_MS; while (Date.now() < deadline) { + if (signal.aborted) return false; try { - if ((await fetch(`${url}/health`)).ok) return true; + if ((await fetch(`${url}/health`, { signal })).ok) return true; } catch {} await new Promise((r) => setTimeout(r, HEALTH_POLL_INTERVAL_MS)); }