Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions apps/code/src/main/services/workspace-server/service.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import { EventEmitter } from "node:events";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

vi.mock("../../utils/logger.js", () => ({
logger: {
scope: () => ({
info: vi.fn(),
error: vi.fn(),
warn: vi.fn(),
debug: vi.fn(),
}),
},
}));

const spawnMock = vi.hoisted(() => vi.fn());
vi.mock("node:child_process", () => ({
spawn: spawnMock,
default: { spawn: spawnMock },
}));

let nextPort = 5000;
const createServerMock = vi.hoisted(() => vi.fn());
vi.mock("node:net", () => ({
createServer: createServerMock,
default: { createServer: createServerMock },
}));

import { WorkspaceServerService } from "./service";

interface FakeChild extends EventEmitter {
stdout: EventEmitter;
stderr: EventEmitter;
kill: ReturnType<typeof vi.fn>;
pid: number;
}

function makeFakeChild(): FakeChild {
const child = new EventEmitter() as FakeChild;
child.stdout = new EventEmitter();
child.stderr = new EventEmitter();
child.kill = vi.fn();
child.pid = 4242;
return child;
}

const fetchMock = vi.fn();

beforeEach(() => {
vi.clearAllMocks();
nextPort = 5000;
createServerMock.mockImplementation(() => {
const port = ++nextPort;
return {
unref: () => {},
on: () => {},
listen: (_p: number, _host: string, cb: () => void) => cb(),
address: () => ({ port }),
close: (cb?: () => void) => cb?.(),
};
});
vi.stubGlobal("fetch", fetchMock);
});

afterEach(() => {
vi.unstubAllGlobals();
});

describe("WorkspaceServerService", () => {
it("resolves the connection once the child reports healthy", async () => {
const child = makeFakeChild();
spawnMock.mockReturnValue(child);
fetchMock.mockResolvedValue({ ok: true });

const service = new WorkspaceServerService();
const conn = await service.start();

expect(spawnMock).toHaveBeenCalledTimes(1);
expect(conn.url).toMatch(/^http:\/\/127\.0\.0\.1:\d+$/);
expect(conn.secret).toHaveLength(64);
});

it("surfaces the child's exit code and stderr instead of an opaque timeout", async () => {
spawnMock.mockImplementation(() => {
const child = makeFakeChild();
setTimeout(() => {
child.stderr.emit(
"data",
Buffer.from(
"[workspace-server] missing or invalid WORKSPACE_SERVER_SECRET / WORKSPACE_SERVER_PORT\n",
),
);
child.emit("exit", 2, null);
}, 0);
return child;
});
// Never healthy: the failure must come from the child exit, not the timeout.
fetchMock.mockRejectedValue(new Error("ECONNREFUSED"));

const service = new WorkspaceServerService();
await expect(service.start()).rejects.toThrow(
/exited before becoming healthy.*exit code 2.*WORKSPACE_SERVER_SECRET/s,
);
// Early exit is treated as retriable (e.g. a stolen port), so we exhaust attempts.
expect(spawnMock).toHaveBeenCalledTimes(3);
});

it("recovers by respawning on a fresh port after an early exit", async () => {
let serverAlive = false;
spawnMock.mockImplementation(() => {
const child = makeFakeChild();
if (spawnMock.mock.calls.length === 1) {
setTimeout(() => {
child.stderr.emit("data", Buffer.from("Error: listen EADDRINUSE\n"));
child.emit("exit", 1, null);
}, 0);
} else {
serverAlive = true;
}
return child;
});
fetchMock.mockImplementation(async () => {
if (!serverAlive) throw new Error("ECONNREFUSED");
return { ok: true };
});

const service = new WorkspaceServerService();
const conn = await service.start();

expect(spawnMock).toHaveBeenCalledTimes(2);
expect(conn.secret).toHaveLength(64);
});
});
112 changes: 103 additions & 9 deletions apps/code/src/main/services/workspace-server/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,17 @@ import { injectable } from "inversify";
import { logger } from "../../utils/logger.js";

const HEALTH_POLL_INTERVAL_MS = 100;
const HEALTH_POLL_TIMEOUT_MS = 5_000;
// Cold starts (slow disks, AV scanning the freshly-spawned node binary) can take
// several seconds to bind. fail-fast on early child exit means a healthy server
// still returns the moment it is reachable, so a generous ceiling only helps the
// genuinely-slow case rather than penalising the common one.
const HEALTH_POLL_TIMEOUT_MS = 15_000;
const SHUTDOWN_GRACE_MS = 3_000;
// Bounded respawn: the dominant transient failure is a port stolen between
// findFreePort() releasing it and the child re-binding, which surfaces as an
// early exit. Retrying picks a fresh port and self-heals that race.
const MAX_SPAWN_ATTEMPTS = 3;
const STDERR_CAPTURE_LIMIT = 8_192;

const log = logger.scope("workspace-server");

Expand All @@ -24,6 +33,21 @@ export interface WorkspaceServerEvents {
};
}

interface ChildExit {
code: number | null;
signal: NodeJS.Signals | null;
error?: Error;
}

class WorkspaceServerStartError extends Error {
readonly retriable: boolean;
constructor(message: string, retriable: boolean) {
super(message);
this.name = "WorkspaceServerStartError";
this.retriable = retriable;
}
}

@injectable()
export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEvents> {
private readonly scriptPath = path.join(__dirname, "workspace-server.js");
Expand All @@ -39,7 +63,7 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve
if (this.connection) return Promise.resolve(this.connection);
if (this.pendingStart) return this.pendingStart;

this.pendingStart = this.spawnChild().finally(() => {
this.pendingStart = this.spawnWithRetries().finally(() => {
this.pendingStart = null;
});
return this.pendingStart;
Expand All @@ -60,6 +84,29 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve
}, SHUTDOWN_GRACE_MS).unref();
}

private async spawnWithRetries(): Promise<WorkspaceConnection> {
let lastError: Error | undefined;
for (let attempt = 1; attempt <= MAX_SPAWN_ATTEMPTS; attempt++) {
try {
return await this.spawnChild();
} catch (err) {
lastError = err instanceof Error ? err : new Error(String(err));
const retriable =
err instanceof WorkspaceServerStartError &&
err.retriable &&
attempt < MAX_SPAWN_ATTEMPTS;
log.warn("workspace-server start attempt failed", {
attempt,
maxAttempts: MAX_SPAWN_ATTEMPTS,
willRetry: retriable,
error: lastError.message,
});
if (!retriable) break;
}
}
throw lastError ?? new Error("workspace-server failed to start");
}

private async spawnChild(): Promise<WorkspaceConnection> {
const port = await findFreePort();
const secret = randomBytes(32).toString("hex");
Expand All @@ -77,10 +124,24 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve
},
windowsHide: true,
});
this.child = c;

// Capture a bounded tail of stderr so a child that dies during boot
// (env validation -> process.exit(2), EADDRINUSE, bundle errors) reports
// the real cause instead of an opaque timeout.
let stderrTail = "";
const exited = new Promise<ChildExit>((resolve) => {
c.once("error", (error) => resolve({ code: null, signal: null, error }));
c.once("exit", (code, signal) => resolve({ code, signal }));
});

c.stdout?.on("data", (chunk) => process.stdout.write(chunk));
c.stderr?.on("data", (chunk) => process.stderr.write(chunk));
c.stderr?.on("data", (chunk) => {
process.stderr.write(chunk);
stderrTail = (stderrTail + String(chunk)).slice(-STDERR_CAPTURE_LIMIT);
});
c.once("exit", (code, signal) => {
if (this.child !== c) return;
const wasConnected = this.connection !== null;
this.child = null;
this.connection = null;
Expand All @@ -90,12 +151,34 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve
}
});

this.child = c;
// Race readiness against death: stop polling the instant the child exits
// rather than waiting out the full timeout against a dead port.
const abort = new AbortController();
const outcome = await Promise.race([
pollHealth(url, abort.signal).then(
(ok) => ({ kind: "health", ok }) as const,
),
exited.then((exit) => {
abort.abort();
return { kind: "exit", exit } as const;
}),
]);

if (outcome.kind === "exit") {
throw new WorkspaceServerStartError(
describeEarlyExit(outcome.exit, stderrTail),
true,
);
}
Comment on lines +167 to +172

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 All early exits treated as retriable regardless of exit code

Every early child exit is thrown as WorkspaceServerStartError(msg, true) — unconditionally retriable — which means a deterministic failure like env-validation (process.exit(2)) will exhaust all 3 attempts before surfacing the error, adding two extra port-allocation + spawn cycles and ~300 ms of delay. The self-described dominant case (port race) surfaces as EADDRINUSE and is genuinely transient, but exit code 2 explicitly means "the server will never start with this configuration" and is not port-dependent. Differentiating on exit.code === 2 (or checking exit.code !== null && exit.code !== 1) would let deterministic failures fail fast while still recovering the TOCTOU race.

Prompt To Fix With AI
This is a comment left during a code review.
Path: apps/code/src/main/services/workspace-server/service.ts
Line: 167-172

Comment:
**All early exits treated as retriable regardless of exit code**

Every early child exit is thrown as `WorkspaceServerStartError(msg, true)` — unconditionally retriable — which means a deterministic failure like env-validation (`process.exit(2)`) will exhaust all 3 attempts before surfacing the error, adding two extra port-allocation + spawn cycles and ~300 ms of delay. The self-described dominant case (port race) surfaces as `EADDRINUSE` and is genuinely transient, but exit code 2 explicitly means "the server will never start with this configuration" and is not port-dependent. Differentiating on `exit.code === 2` (or checking `exit.code !== null && exit.code !== 1`) would let deterministic failures fail fast while still recovering the TOCTOU race.

How can I resolve this? If you propose a fix, please make it concise.


if (!(await pollHealth(url))) {
if (!outcome.ok) {
this.stop();
throw new Error(
`workspace-server failed to become healthy within ${HEALTH_POLL_TIMEOUT_MS}ms`,
const tail = stderrTail.trim();
throw new WorkspaceServerStartError(
`workspace-server failed to become healthy within ${HEALTH_POLL_TIMEOUT_MS}ms${
tail ? `; last stderr: ${tail}` : ""
}`,
false,
);
}

Expand All @@ -104,6 +187,16 @@ export class WorkspaceServerService extends TypedEventEmitter<WorkspaceServerEve
}
}

function describeEarlyExit(exit: ChildExit, stderrTail: string): string {
const parts = ["workspace-server exited before becoming healthy"];
if (exit.error) parts.push(`spawn error: ${exit.error.message}`);
if (exit.code !== null) parts.push(`exit code ${exit.code}`);
if (exit.signal) parts.push(`signal ${exit.signal}`);
const tail = stderrTail.trim();
if (tail) parts.push(`stderr: ${tail}`);
return parts.join("; ");
}

async function findFreePort(): Promise<number> {
return new Promise((resolve, reject) => {
const s = createServer();
Expand All @@ -122,11 +215,12 @@ async function findFreePort(): Promise<number> {
});
}

async function pollHealth(url: string): Promise<boolean> {
async function pollHealth(url: string, signal: AbortSignal): Promise<boolean> {
const deadline = Date.now() + HEALTH_POLL_TIMEOUT_MS;
while (Date.now() < deadline) {
if (signal.aborted) return false;
try {
if ((await fetch(`${url}/health`)).ok) return true;
if ((await fetch(`${url}/health`, { signal })).ok) return true;
} catch {}
await new Promise((r) => setTimeout(r, HEALTH_POLL_INTERVAL_MS));
}
Comment on lines 224 to 226

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Abort signal is not applied to the inter-poll sleep

After the child exits and abort.abort() fires, a fetch that is in-flight will throw AbortError (caught), but the await new Promise((r) => setTimeout(r, HEALTH_POLL_INTERVAL_MS)) that follows is not abort-aware. This adds an unconditional 100 ms delay to every abort path before signal.aborted is re-checked at the top of the loop. In the fast-exit scenario this is negligible, but replacing the plain setTimeout with one that is cleared on abort (or checking signal.aborted before sleeping) would make the abort truly immediate.

Prompt To Fix With AI
This is a comment left during a code review.
Path: apps/code/src/main/services/workspace-server/service.ts
Line: 224-226

Comment:
**Abort signal is not applied to the inter-poll sleep**

After the child exits and `abort.abort()` fires, a fetch that is in-flight will throw `AbortError` (caught), but the `await new Promise((r) => setTimeout(r, HEALTH_POLL_INTERVAL_MS))` that follows is not abort-aware. This adds an unconditional 100 ms delay to every abort path before `signal.aborted` is re-checked at the top of the loop. In the fast-exit scenario this is negligible, but replacing the plain `setTimeout` with one that is cleared on abort (or checking `signal.aborted` before sleeping) would make the abort truly immediate.

How can I resolve this? If you propose a fix, please make it concise.

Expand Down
Loading