From f3049f61a8c731693dedaa9adac3a24138c3d0a7 Mon Sep 17 00:00:00 2001
From: nicktrn <55853254+nicktrn@users.noreply.github.com>
Date: Fri, 30 Jan 2026 12:40:30 +0000
Subject: [PATCH] fix(cli): reject execute() immediately when child process is dead

When a child process crashes and a retry is attempted on the same
TaskRunProcess, execute() would hang forever because the IPC send was
silently skipped and the attempt promise could never resolve. This caused
runner pods to stay up indefinitely with no heartbeats.
---
 .changeset/fix-dead-process-execute-hang.md   |   5 +
 .../src/executions/taskRunProcess.test.ts     | 121 ++++++++++++++++++
 .../cli-v3/src/executions/taskRunProcess.ts   |  13 ++
 3 files changed, 139 insertions(+)
 create mode 100644 .changeset/fix-dead-process-execute-hang.md
 create mode 100644 packages/cli-v3/src/executions/taskRunProcess.test.ts

diff --git a/.changeset/fix-dead-process-execute-hang.md b/.changeset/fix-dead-process-execute-hang.md
new file mode 100644
index 0000000000..fa96e9c88c
--- /dev/null
+++ b/.changeset/fix-dead-process-execute-hang.md
@@ -0,0 +1,5 @@
+---
+"trigger.dev": patch
+---
+
+Fix runner getting stuck indefinitely when `execute()` is called on a dead child process.
diff --git a/packages/cli-v3/src/executions/taskRunProcess.test.ts b/packages/cli-v3/src/executions/taskRunProcess.test.ts
new file mode 100644
index 0000000000..82ab19639b
--- /dev/null
+++ b/packages/cli-v3/src/executions/taskRunProcess.test.ts
@@ -0,0 +1,121 @@
+import { TaskRunProcess, type TaskRunProcessOptions } from "./taskRunProcess.js";
+import { describe, it, expect, vi } from "vitest";
+import { UnexpectedExitError } from "@trigger.dev/core/v3/errors";
+import type {
+  TaskRunExecution,
+  TaskRunExecutionPayload,
+  WorkerManifest,
+  ServerBackgroundWorker,
+  MachinePresetResources,
+} from "@trigger.dev/core/v3";
+
+/** Builds minimal TaskRunProcessOptions for tests; stubbed fields are cast to their real types. */
+function createTaskRunProcessOptions(
+  overrides: Partial<TaskRunProcessOptions> = {}
+): TaskRunProcessOptions {
+  return {
+    workerManifest: {
+      runtime: "node",
+      workerEntryPoint: "/dev/null",
+      configEntryPoint: "/dev/null",
+      otelImportHook: {},
+    } as unknown as WorkerManifest,
+    serverWorker: {} as unknown as ServerBackgroundWorker,
+    env: {},
+    machineResources: { cpu: 1, memory: 1 } as MachinePresetResources,
+    ...overrides,
+  };
+}
+
+/** Builds a stub TaskRunExecution for the given run id and attempt number. */
+function createExecution(runId: string, attemptNumber: number): TaskRunExecution {
+  return {
+    run: {
+      id: runId,
+      payload: "{}",
+      payloadType: "application/json",
+      tags: [],
+      isTest: false,
+      createdAt: new Date(),
+      startedAt: new Date(),
+      maxAttempts: 3,
+      version: "1",
+      durationMs: 0,
+      costInCents: 0,
+      baseCostInCents: 0,
+    },
+    attempt: {
+      number: attemptNumber,
+      startedAt: new Date(),
+      id: "deprecated",
+      backgroundWorkerId: "deprecated",
+      backgroundWorkerTaskId: "deprecated",
+      status: "deprecated" as any,
+    },
+    task: { id: "test-task", filePath: "test.ts" },
+    queue: { id: "queue-1", name: "test-queue" },
+    environment: { id: "env-1", slug: "test", type: "DEVELOPMENT" },
+    organization: { id: "org-1", slug: "test-org", name: "Test Org" },
+    project: { id: "proj-1", ref: "proj_test", slug: "test", name: "Test" },
+    machine: { name: "small-1x", cpu: 0.5, memory: 0.5, centsPerMs: 0 },
+  } as unknown as TaskRunExecution;
+}
+
+describe("TaskRunProcess", () => {
+  describe("execute() on a dead child process", () => {
+    it("should reject when child process has already exited and IPC send is skipped", async () => {
+      const proc = new TaskRunProcess(createTaskRunProcessOptions());
+
+      // Simulate a child process that has exited: _child exists but is not connected
+      const fakeChild = {
+        connected: false,
+        killed: false,
+        pid: 12345,
+        kill: vi.fn(),
+        on: vi.fn(),
+        stdout: { on: vi.fn() },
+        stderr: { on: vi.fn() },
+      };
+
+      // Set internal state to mimic a process whose child has crashed
+      (proc as any)._child = fakeChild;
+      (proc as any)._childPid = 12345;
+      (proc as any)._isBeingKilled = false;
+
+      const execution = createExecution("run-1", 2);
+
+      // Regression guard: execute() must reject promptly instead of hanging.
+      // Before the fix it created the attempt promise, skipped the IPC send
+      // because _child.connected was false, then awaited a promise that nothing
+      // could settle. Racing against a timeout turns such a hang into a test
+      // failure.
+      const result = await Promise.race([
+        proc
+          .execute(
+            {
+              payload: { execution, traceContext: {}, metrics: [] },
+              messageId: "run_run-1",
+              env: {},
+            },
+            true
+          )
+          .then(
+            (v) => ({ type: "resolved" as const, value: v }),
+            (e) => ({ type: "rejected" as const, error: e })
+          ),
+        new Promise<{ type: "hung" }>((resolve) =>
+          setTimeout(() => resolve({ type: "hung" as const }), 2000)
+        ),
+      ]);
+
+      // A "hung" result means execute() never settled, i.e. the hang has regressed
+      expect(result.type).not.toBe("hung");
+      expect(result.type).toBe("rejected");
+
+      if (result.type === "rejected") {
+        expect(result.error).toBeInstanceOf(UnexpectedExitError);
+        expect(result.error.stderr).toContain("not connected");
+      }
+    });
+  });
+});
diff --git a/packages/cli-v3/src/executions/taskRunProcess.ts b/packages/cli-v3/src/executions/taskRunProcess.ts
index 098b0f261c..1e274ba02f 100644
--- a/packages/cli-v3/src/executions/taskRunProcess.ts
+++ b/packages/cli-v3/src/executions/taskRunProcess.ts
@@ -297,6 +297,19 @@ export class TaskRunProcess {
         env: params.env,
         isWarmStart: isWarmStart ?? this.options.isWarmStart,
       });
+    } else {
+      // Child process is dead or disconnected — the IPC send was skipped so the attempt
+      // promise would hang forever. Reject it immediately to let the caller handle it.
+      this._attemptStatuses.set(key, "REJECTED");
+
+      // @ts-expect-error - rejecter is assigned in the promise constructor above
+      rejecter(
+        new UnexpectedExitError(
+          -1,
+          null,
+          "Child process is not connected, cannot execute task run"
+        )
+      );
     }
 
     const result = await promise;