-
Notifications
You must be signed in to change notification settings - Fork 41
fix(workspace-server): diagnose and self-heal startup failures #2675
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| import { EventEmitter } from "node:events"; | ||
| import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; | ||
|
|
||
| vi.mock("../../utils/logger.js", () => ({ | ||
| logger: { | ||
| scope: () => ({ | ||
| info: vi.fn(), | ||
| error: vi.fn(), | ||
| warn: vi.fn(), | ||
| debug: vi.fn(), | ||
| }), | ||
| }, | ||
| })); | ||
|
|
||
| const spawnMock = vi.hoisted(() => vi.fn()); | ||
| vi.mock("node:child_process", () => ({ | ||
| spawn: spawnMock, | ||
| default: { spawn: spawnMock }, | ||
| })); | ||
|
|
||
| let nextPort = 5000; | ||
| const createServerMock = vi.hoisted(() => vi.fn()); | ||
| vi.mock("node:net", () => ({ | ||
| createServer: createServerMock, | ||
| default: { createServer: createServerMock }, | ||
| })); | ||
|
|
||
| import { WorkspaceServerService } from "./service"; | ||
|
|
||
| interface FakeChild extends EventEmitter { | ||
| stdout: EventEmitter; | ||
| stderr: EventEmitter; | ||
| kill: ReturnType<typeof vi.fn>; | ||
| pid: number; | ||
| } | ||
|
|
||
| function makeFakeChild(): FakeChild { | ||
| const child = new EventEmitter() as FakeChild; | ||
| child.stdout = new EventEmitter(); | ||
| child.stderr = new EventEmitter(); | ||
| child.kill = vi.fn(); | ||
| child.pid = 4242; | ||
| return child; | ||
| } | ||
|
|
||
| const fetchMock = vi.fn(); | ||
|
|
||
| beforeEach(() => { | ||
| vi.clearAllMocks(); | ||
| nextPort = 5000; | ||
| createServerMock.mockImplementation(() => { | ||
| const port = ++nextPort; | ||
| return { | ||
| unref: () => {}, | ||
| on: () => {}, | ||
| listen: (_p: number, _host: string, cb: () => void) => cb(), | ||
| address: () => ({ port }), | ||
| close: (cb?: () => void) => cb?.(), | ||
| }; | ||
| }); | ||
| vi.stubGlobal("fetch", fetchMock); | ||
| }); | ||
|
|
||
| afterEach(() => { | ||
| vi.unstubAllGlobals(); | ||
| }); | ||
|
|
||
| describe("WorkspaceServerService", () => { | ||
| it("resolves the connection once the child reports healthy", async () => { | ||
| const child = makeFakeChild(); | ||
| spawnMock.mockReturnValue(child); | ||
| fetchMock.mockResolvedValue({ ok: true }); | ||
|
|
||
| const service = new WorkspaceServerService(); | ||
| const conn = await service.start(); | ||
|
|
||
| expect(spawnMock).toHaveBeenCalledTimes(1); | ||
| expect(conn.url).toMatch(/^http:\/\/127\.0\.0\.1:\d+$/); | ||
| expect(conn.secret).toHaveLength(64); | ||
| }); | ||
|
|
||
| it("surfaces the child's exit code and stderr instead of an opaque timeout", async () => { | ||
| spawnMock.mockImplementation(() => { | ||
| const child = makeFakeChild(); | ||
| setTimeout(() => { | ||
| child.stderr.emit( | ||
| "data", | ||
| Buffer.from( | ||
| "[workspace-server] missing or invalid WORKSPACE_SERVER_SECRET / WORKSPACE_SERVER_PORT\n", | ||
| ), | ||
| ); | ||
| child.emit("exit", 2, null); | ||
| }, 0); | ||
| return child; | ||
| }); | ||
| // Never healthy: the failure must come from the child exit, not the timeout. | ||
| fetchMock.mockRejectedValue(new Error("ECONNREFUSED")); | ||
|
|
||
| const service = new WorkspaceServerService(); | ||
| await expect(service.start()).rejects.toThrow( | ||
| /exited before becoming healthy.*exit code 2.*WORKSPACE_SERVER_SECRET/s, | ||
| ); | ||
| // Early exit is treated as retriable (e.g. a stolen port), so we exhaust attempts. | ||
| expect(spawnMock).toHaveBeenCalledTimes(3); | ||
| }); | ||
|
|
||
| it("recovers by respawning on a fresh port after an early exit", async () => { | ||
| let serverAlive = false; | ||
| spawnMock.mockImplementation(() => { | ||
| const child = makeFakeChild(); | ||
| if (spawnMock.mock.calls.length === 1) { | ||
| setTimeout(() => { | ||
| child.stderr.emit("data", Buffer.from("Error: listen EADDRINUSE\n")); | ||
| child.emit("exit", 1, null); | ||
| }, 0); | ||
| } else { | ||
| serverAlive = true; | ||
| } | ||
| return child; | ||
| }); | ||
| fetchMock.mockImplementation(async () => { | ||
| if (!serverAlive) throw new Error("ECONNREFUSED"); | ||
| return { ok: true }; | ||
| }); | ||
|
|
||
| const service = new WorkspaceServerService(); | ||
| const conn = await service.start(); | ||
|
|
||
| expect(spawnMock).toHaveBeenCalledTimes(2); | ||
| expect(conn.secret).toHaveLength(64); | ||
| }); | ||
| }); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,8 +8,17 @@ import { injectable } from "inversify"; | |
| import { logger } from "../../utils/logger.js"; | ||
|
|
||
| const HEALTH_POLL_INTERVAL_MS = 100; | ||
| const HEALTH_POLL_TIMEOUT_MS = 5_000; | ||
| // Cold starts (slow disks, AV scanning the freshly-spawned node binary) can take | ||
| // several seconds to bind. fail-fast on early child exit means a healthy server | ||
| // still returns the moment it is reachable, so a generous ceiling only helps the | ||
| // genuinely-slow case rather than penalising the common one. | ||
| const HEALTH_POLL_TIMEOUT_MS = 15_000; | ||
| const SHUTDOWN_GRACE_MS = 3_000; | ||
| // Bounded respawn: the dominant transient failure is a port stolen between | ||
| // findFreePort() releasing it and the child re-binding, which surfaces as an | ||
| // early exit. Retrying picks a fresh port and self-heals that race. | ||
| const MAX_SPAWN_ATTEMPTS = 3; | ||
| const STDERR_CAPTURE_LIMIT = 8_192; | ||
|
|
||
| const log = logger.scope("workspace-server"); | ||
|
|
||
|
|
@@ -24,6 +33,21 @@ export interface WorkspaceServerEvents { | |
| }; | ||
| } | ||
|
|
||
| interface ChildExit { | ||
| code: number | null; | ||
| signal: NodeJS.Signals | null; | ||
| error?: Error; | ||
| } | ||
|
|
||
| class WorkspaceServerStartError extends Error { | ||
| readonly retriable: boolean; | ||
| constructor(message: string, retriable: boolean) { | ||
| super(message); | ||
| this.name = "WorkspaceServerStartError"; | ||
| this.retriable = retriable; | ||
| } | ||
| } | ||
|
|
||
| @injectable() | ||
| export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEvents> { | ||
| private readonly scriptPath = path.join(__dirname, "workspace-server.js"); | ||
|
|
@@ -39,7 +63,7 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve | |
| if (this.connection) return Promise.resolve(this.connection); | ||
| if (this.pendingStart) return this.pendingStart; | ||
|
|
||
| this.pendingStart = this.spawnChild().finally(() => { | ||
| this.pendingStart = this.spawnWithRetries().finally(() => { | ||
| this.pendingStart = null; | ||
| }); | ||
| return this.pendingStart; | ||
|
|
@@ -60,6 +84,29 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve | |
| }, SHUTDOWN_GRACE_MS).unref(); | ||
| } | ||
|
|
||
| private async spawnWithRetries(): Promise<WorkspaceConnection> { | ||
| let lastError: Error | undefined; | ||
| for (let attempt = 1; attempt <= MAX_SPAWN_ATTEMPTS; attempt++) { | ||
| try { | ||
| return await this.spawnChild(); | ||
| } catch (err) { | ||
| lastError = err instanceof Error ? err : new Error(String(err)); | ||
| const retriable = | ||
| err instanceof WorkspaceServerStartError && | ||
| err.retriable && | ||
| attempt < MAX_SPAWN_ATTEMPTS; | ||
| log.warn("workspace-server start attempt failed", { | ||
| attempt, | ||
| maxAttempts: MAX_SPAWN_ATTEMPTS, | ||
| willRetry: retriable, | ||
| error: lastError.message, | ||
| }); | ||
| if (!retriable) break; | ||
| } | ||
| } | ||
| throw lastError ?? new Error("workspace-server failed to start"); | ||
| } | ||
|
|
||
| private async spawnChild(): Promise<WorkspaceConnection> { | ||
| const port = await findFreePort(); | ||
| const secret = randomBytes(32).toString("hex"); | ||
|
|
@@ -77,10 +124,24 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve | |
| }, | ||
| windowsHide: true, | ||
| }); | ||
| this.child = c; | ||
|
|
||
| // Capture a bounded tail of stderr so a child that dies during boot | ||
| // (env validation -> process.exit(2), EADDRINUSE, bundle errors) reports | ||
| // the real cause instead of an opaque timeout. | ||
| let stderrTail = ""; | ||
| const exited = new Promise<ChildExit>((resolve) => { | ||
| c.once("error", (error) => resolve({ code: null, signal: null, error })); | ||
| c.once("exit", (code, signal) => resolve({ code, signal })); | ||
| }); | ||
|
|
||
| c.stdout?.on("data", (chunk) => process.stdout.write(chunk)); | ||
| c.stderr?.on("data", (chunk) => process.stderr.write(chunk)); | ||
| c.stderr?.on("data", (chunk) => { | ||
| process.stderr.write(chunk); | ||
| stderrTail = (stderrTail + String(chunk)).slice(-STDERR_CAPTURE_LIMIT); | ||
| }); | ||
| c.once("exit", (code, signal) => { | ||
| if (this.child !== c) return; | ||
| const wasConnected = this.connection !== null; | ||
| this.child = null; | ||
| this.connection = null; | ||
|
|
@@ -90,12 +151,34 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve | |
| } | ||
| }); | ||
|
|
||
| this.child = c; | ||
| // Race readiness against death: stop polling the instant the child exits | ||
| // rather than waiting out the full timeout against a dead port. | ||
| const abort = new AbortController(); | ||
| const outcome = await Promise.race([ | ||
| pollHealth(url, abort.signal).then( | ||
| (ok) => ({ kind: "health", ok }) as const, | ||
| ), | ||
| exited.then((exit) => { | ||
| abort.abort(); | ||
| return { kind: "exit", exit } as const; | ||
| }), | ||
| ]); | ||
|
|
||
| if (outcome.kind === "exit") { | ||
| throw new WorkspaceServerStartError( | ||
| describeEarlyExit(outcome.exit, stderrTail), | ||
| true, | ||
| ); | ||
| } | ||
|
|
||
| if (!(await pollHealth(url))) { | ||
| if (!outcome.ok) { | ||
| this.stop(); | ||
| throw new Error( | ||
| `workspace-server failed to become healthy within ${HEALTH_POLL_TIMEOUT_MS}ms`, | ||
| const tail = stderrTail.trim(); | ||
| throw new WorkspaceServerStartError( | ||
| `workspace-server failed to become healthy within ${HEALTH_POLL_TIMEOUT_MS}ms${ | ||
| tail ? `; last stderr: ${tail}` : "" | ||
| }`, | ||
| false, | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -104,6 +187,16 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve | |
| } | ||
| } | ||
|
|
||
| function describeEarlyExit(exit: ChildExit, stderrTail: string): string { | ||
| const parts = ["workspace-server exited before becoming healthy"]; | ||
| if (exit.error) parts.push(`spawn error: ${exit.error.message}`); | ||
| if (exit.code !== null) parts.push(`exit code ${exit.code}`); | ||
| if (exit.signal) parts.push(`signal ${exit.signal}`); | ||
| const tail = stderrTail.trim(); | ||
| if (tail) parts.push(`stderr: ${tail}`); | ||
| return parts.join("; "); | ||
| } | ||
|
|
||
| async function findFreePort(): Promise<number> { | ||
| return new Promise((resolve, reject) => { | ||
| const s = createServer(); | ||
|
|
@@ -122,11 +215,12 @@ async function findFreePort(): Promise<number> { | |
| }); | ||
| } | ||
|
|
||
| async function pollHealth(url: string): Promise<boolean> { | ||
| async function pollHealth(url: string, signal: AbortSignal): Promise<boolean> { | ||
| const deadline = Date.now() + HEALTH_POLL_TIMEOUT_MS; | ||
| while (Date.now() < deadline) { | ||
| if (signal.aborted) return false; | ||
| try { | ||
| if ((await fetch(`${url}/health`)).ok) return true; | ||
| if ((await fetch(`${url}/health`, { signal })).ok) return true; | ||
| } catch {} | ||
| await new Promise((r) => setTimeout(r, HEALTH_POLL_INTERVAL_MS)); | ||
| } | ||
|
Comment on lines
224
to
226
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
After the child exits and Prompt To Fix With AIThis is a comment left during a code review.
Path: apps/code/src/main/services/workspace-server/service.ts
Line: 224-226
Comment:
**Abort signal is not applied to the inter-poll sleep**
After the child exits and `abort.abort()` fires, a fetch that is in-flight will throw `AbortError` (caught), but the `await new Promise((r) => setTimeout(r, HEALTH_POLL_INTERVAL_MS))` that follows is not abort-aware. This adds an unconditional 100 ms delay to every abort path before `signal.aborted` is re-checked at the top of the loop. In the fast-exit scenario this is negligible, but replacing the plain `setTimeout` with one that is cleared on abort (or checking `signal.aborted` before sleeping) would make the abort truly immediate.
How can I resolve this? If you propose a fix, please make it concise. |
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Every early child exit is thrown as
WorkspaceServerStartError(msg, true)— unconditionally retriable — which means a deterministic failure like env-validation (process.exit(2)) will exhaust all 3 attempts before surfacing the error, adding two extra port-allocation + spawn cycles and ~300 ms of delay. The self-described dominant case (port race) surfaces asEADDRINUSEand is genuinely transient, but exit code 2 explicitly means "the server will never start with this configuration" and is not port-dependent. Differentiating onexit.code === 2(or checkingexit.code !== null && exit.code !== 1) would let deterministic failures fail fast while still recovering the TOCTOU race.Prompt To Fix With AI