From 6934cf9b8a53113b822596d465fce7d8a9e43c76 Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 21:54:55 +0530
Subject: [PATCH 01/12] =?UTF-8?q?feat(rca):=20scaffold=20plugin=20?=
 =?UTF-8?q?=E2=80=94=20manifest,=20MCP=20wiring,=20config,=20command,=20RE?=
 =?UTF-8?q?ADME?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Identity-only .claude-plugin/plugin.json; root .mcp.json wires the bstack MCP
server (stdio); config/rca.config.json centralizes all formerly-hardcoded
product/infra values (no kubectl/chitragupta/bifrost literals); /rca-build
command parses build id + mode and hands off to the skill.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .claude-plugin/plugin.json | 11 +++++++
 .env.example               |  8 +++++
 .gitignore                 |  4 +++
 .mcp.json                  | 14 ++++++++
 README.md                  | 67 ++++++++++++++++++++++++++++++++++++--
 commands/rca-build.md      | 34 +++++++++++++++++++
 config/rca.config.json     | 25 ++++++++++++++
 package.json               | 10 ++++++
 8 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 .claude-plugin/plugin.json
 create mode 100644 .env.example
 create mode 100644 .gitignore
 create mode 100644 .mcp.json
 create mode 100644 commands/rca-build.md
 create mode 100644 config/rca.config.json
 create mode 100644 package.json

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..c8c4beb
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,11 @@
+{
+  "name": "tfa-rca",
+  "description": "Drive collaborative root-cause analysis over all failed tests of a build, generic across product and infra.",
+  "version": "0.1.0",
+  "author": {
+    "name": "BrowserStack",
+    "url": "https://www.browserstack.com"
+  },
+  "homepage": "https://github.com/browserstack/browserstack-ai-tfa-demo",
+  "license": "MIT"
+}
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..d86819e
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,8 @@
+# BrowserStack credentials — used by the bundled bstack MCP server for
+# listTestIds + tfaRcaTurn. Per-user; never commit real values.
+BROWSERSTACK_USERNAME=
+BROWSERSTACK_ACCESS_KEY=
+
+# Observability base URL the TFA RCA chat runs against. Optional —
+# the bstack MCP server defaults to its rengg-tfa staging URL when unset.
+# O11Y_TFA_RCA_BASE_URL=https://api-observability-rengg-tfa.bsstag.com
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9045f9d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+node_modules/
+.env
+# Per-run RCA batch state (the CSV/WAL spine + report) is workspace-local.
+.rca/
diff --git a/.mcp.json b/.mcp.json
new file mode 100644
index 0000000..0502929
--- /dev/null
+++ b/.mcp.json
@@ -0,0 +1,14 @@
+{
+  "mcpServers": {
+    "bstack": {
+      "type": "stdio",
+      "command": "npx",
+      "args": ["-y", "@browserstack/mcp-server"],
+      "env": {
+        "BROWSERSTACK_USERNAME": "${BROWSERSTACK_USERNAME}",
+        "BROWSERSTACK_ACCESS_KEY": "${BROWSERSTACK_ACCESS_KEY}",
+        "O11Y_TFA_RCA_BASE_URL": "${O11Y_TFA_RCA_BASE_URL}"
+      }
+    }
+  }
+}
diff --git a/README.md b/README.md
index 423d780..ff148ab 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,65 @@
-# browserstack-ai-tfa-demo
-AI TFA Demo 
+# tfa-rca — generic multi-client RCA agent plugin
+
+Drive BrowserStack's collaborative root-cause-analysis loop over **all failed
+tests of a build**, generic across product and infra, from inside an agentic
+MCP client (Claude Code / Cursor / Codex).
+
+The plugin wraps two stable MCP tools — `listTestIds` and `tfaRcaTurn` (from the
+`bstack` MCP server) — and adds the harness that batches RCA over a whole build,
+clusters failures by signature, routes evidence requests to whatever
+skills/tools the client already has, and writes a per-test RCA into the TRA
+dashboard.
+
+> It **discovers and delegates** to the infra skills/tools already in your
+> client (GitHub, k8s/EKS, kibana/other logs, metrics). It does **not** install
+> or own those connectors.
+
+## Install
+
+```bash
+git clone https://github.com/browserstack/browserstack-ai-tfa-demo.git
+cd browserstack-ai-tfa-demo
+cp .env.example .env   # fill in BROWSERSTACK_USERNAME / BROWSERSTACK_ACCESS_KEY
+claude --plugin-dir ./
+```
+
+The plugin auto-configures on load: the `bstack` MCP server (from `.mcp.json`),
+the `/rca-build` command, the `rca-build` skill, and the `ai-tfa-coordinator`
+agent are all discovered by convention.
+
+## Usage
+
+```
+/rca-build <build-id>
+/rca-build build_id=<id> mode=auto
+```
+
+On start the plugin runs a **mandatory pre-flight intake** asking for your
+product + automation repos, working branch, default branch, and the PRs in
+play, plus the build id. You can answer any question with "I don't have one";
+the run then proceeds RCA-only.
+
+## Modes
+
+- **auto** — a dynamic workflow drives the whole batch (5 tests at a time), no
+  mid-run prompts. When evidence can't be gathered (no matching skill), it
+  reports "unavailable" back to the TFA agent, which finalizes best-effort.
+- **interactive** — the main session spawns subagents (5 at a time); on an
+  evidence gap a subagent returns the gap to the main agent, which asks you,
+  then feeds the answer back.
+
+`auto` means autonomy *during* the batch from an interactive session — not
+headless. Running `claude -p` with a required input missing ends immediately.
+
+## Requirements
+
+- The `bstack` MCP server (bundled via `.mcp.json`).
+- Credentials in `.env` (or your client's MCP env).
+- For full evidence coverage: whatever GitHub / infra / logging / metrics
+  skills your client already has. Missing ones degrade gracefully (the RCA's
+  confidence band reflects what evidence was actually available).
+
+## Layout
+
+See `docs/plans/2026-06-23-001-feat-generic-rca-agent-plugin-plan.md` for the
+implementation plan and `docs/brainstorms/` for the requirements.
diff --git a/commands/rca-build.md b/commands/rca-build.md
new file mode 100644
index 0000000..7a7a829
--- /dev/null
+++ b/commands/rca-build.md
@@ -0,0 +1,34 @@
+---
+description: Run collaborative RCA over all failed tests of a BrowserStack build
+---
+
+# /rca-build
+
+Entry point for the generic RCA harness. Drives a collaborative root-cause
+analysis loop over **every failed test** of a build, generic across product and
+infra.
+
+## Input
+
+`$ARGUMENTS` carries the build id (and optional flags). Accepted forms:
+
+- bare build id: `qzqhbfa5bkjakcbxtvy2siwtpcvsvgm9fxfyb03d5`
+- `build_id=<id>`
+- a build dashboard link (the id is extracted)
+- optional `mode=auto` | `mode=interactive` (default: prompt the user)
+
+Parse the build id. If none is present, this is a required input:
+
+- in an interactive session → ask the user for it
+- in headless (`claude -p`) → **end immediately** (fail fast), do not hang
+
+## Behavior
+
+Invoke the `rca-build` skill, passing the parsed build id and mode. The skill
+owns the full flow: mandatory pre-flight GitHub intake → discovery via
+`listTestIds` → CSV/WAL spine → failure-signature clustering → fan-out
+(auto = dynamic workflow / interactive = subagents) → per-test RCA loop via
+`tfaRcaTurn` → report.
+
+Do not re-implement the orchestration here — this command only parses input and
+hands off to the skill.
diff --git a/config/rca.config.json b/config/rca.config.json
new file mode 100644
index 0000000..b7633bb
--- /dev/null
+++ b/config/rca.config.json
@@ -0,0 +1,25 @@
+{
+  "$comment": "Central config for the generic RCA harness. All formerly-hardcoded product/infra values live here. No kubectl/chitragupta/bifrost literals — infra tools are discovered at runtime via the capability manifest (see skills/rca-build/references/evidence-routing.md).",
+  "mcpServerName": "bstack",
+  "concurrency": 5,
+  "turnCap": 6,
+  "turnMessageMaxChars": 5000,
+  "pollSoftPendingMs": 90000,
+  "reaperHeartbeatTtlSec": 600,
+  "errorSummaryMaxChars": 200,
+  "paths": {
+    "stateDir": ".rca",
+    "csvFile": ".rca/rca-state.csv",
+    "reportFile": ".rca/rca-report.md"
+  },
+  "evidenceRouting": {
+    "test_logs": { "owner": "tfa", "skip": true },
+    "product_code": { "capability": "github", "discoveryHints": ["github-mcp", "gh"] },
+    "deploy": { "capability": "github", "discoveryHints": ["github-mcp", "gh"] },
+    "ci": { "capability": "github", "discoveryHints": ["github-mcp", "gh"] },
+    "k8s": { "capability": "k8s", "discoveryHints": [] },
+    "kibana": { "capability": "logs", "discoveryHints": [] },
+    "metrics": { "capability": "metrics", "discoveryHints": [] },
+    "other": { "capability": "other", "discoveryHints": [] }
+  }
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..c11e40f
--- /dev/null
+++ b/package.json
@@ -0,0 +1,10 @@
+{
+  "name": "tfa-rca-plugin",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "description": "Generic multi-client RCA agent plugin harness",
+  "scripts": {
+    "test": "node --test tests/"
+  }
+}

From f0d5cf63e017a5669764a7f59446d16a59550d0b Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 21:58:47 +0530
Subject: [PATCH 02/12] feat(rca): generic per-test RCA coordinator +
 evidence-routing registry

Port the obs-tfa-rca loop decoupled: ai-tfa-coordinator drives tfaRcaTurn to a
terminal RCA (turn-cap, one-thread, soft-PENDING, digest-not-dump) with the
gather mechanism routed by capability (no kubectl/chitragupta/bifrost literals).
lib/routing.mjs classifies each ask skip/gather/gap against the config registry
+ capability manifest; the gap action is the only mode fork (auto=unavailable,
interactive=ask-user). references/evidence-routing.md carries the digest format
and size caps verbatim. Adds sibling pre-seed one-turn-confirm hook.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 agents/ai-tfa-coordinator.md                  | 185 ++++++++++++++++++
 lib/routing.mjs                               |  75 +++++++
 package.json                                  |   2 +-
 .../rca-build/references/evidence-routing.md  | 133 +++++++++++++
 tests/routing.test.mjs                        |  80 ++++++++
 5 files changed, 474 insertions(+), 1 deletion(-)
 create mode 100644 agents/ai-tfa-coordinator.md
 create mode 100644 lib/routing.mjs
 create mode 100644 skills/rca-build/references/evidence-routing.md
 create mode 100644 tests/routing.test.mjs

diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md
new file mode 100644
index 0000000..e2045fb
--- /dev/null
+++ b/agents/ai-tfa-coordinator.md
@@ -0,0 +1,185 @@
+---
+name: ai-tfa-coordinator
+description: 'Per-test collaborative-RCA coordinator. Given ONE testRunId, drives the tfaRcaTurn MCP loop to a terminal root cause: TFA reads the run logs; this coordinator supplies every non-log evidence ask (product code, k8s, kibana, metrics, deploy, ci) using whatever skills/tools the client has, routed through the capability manifest. Skips every test_logs ask (TFA owns logs). Emits a structured RCA_OUTPUT block. Generic over product and infra — no hardcoded tools. Examples:
+- orchestrator: Agent(subagent_type="ai-tfa-coordinator", prompt="RCA testRunId=39 — error: empty buildName rejected on POST /builds") → drives the loop, returns RCA_OUTPUT
+- sibling confirm: Agent(subagent_type="ai-tfa-coordinator", prompt="RCA testRunId=40 — pre-seed: cause=<rep root cause>, suspect PR=#7421") → one-turn confirm against the logs of this test
+- user: "run collaborative RCA on test run 39" → single-test loop to RESOLVED/BLOCKED/PENDING'
+tools: [Bash, Read, Grep, Glob, Task, mcp__*__tfaRcaTurn, mcp__github__*]
+model: sonnet
+---
+
+# Per-Test Collaborative RCA Coordinator (`ai-tfa-coordinator`)
+
+Drives the `tfaRcaTurn` MCP loop for a **single** failed test to a terminal RCA.
+The collaboration contract is fixed: **TFA owns logs; this coordinator owns
+everything else.** TFA (server-side, via the tool) reads the run's logs from its
+own access and emits typed evidence asks; this coordinator fulfills every
+**non-log** ask using whatever skills/tools the client has — routed through the
+capability manifest — digests the findings, and feeds them back on the same
+thread until TFA converges. TFA authors the RCA into the TRA dashboard.
+
+This coordinator is the **reusable unit**: it takes one `testRunId` and runs
+standalone, driven by the auto workflow, an interactive subagent, or a thin
+sequential harness. It is **generic over product and infra** — it names no
+`kubectl` / `chitragupta` / `bifrost`; it routes by *capability*.
+
+## Inputs
+
+- `testRunId` — **required**, the integer test-run ID. Maps to the tool's `testRunId` arg.
+- `error_digest` — optional short error title + endpoint (NOT logs) for the first-turn message.
+- `pre_seed` — optional. For a **cluster sibling**: the representative's
+  `root_cause` + suspect `related_prs`. When present, the first-turn message
+  states the hypothesis and asks TFA to **confirm it against this test's own logs**.
+- `resume` — optional `{ threadId, turnId }` from a prior PENDING run.
+- `manifest` — the capability manifest `{ capability: { available, via } }` (from the orchestrator's pre-compute).
+- `mode` — `auto` | `interactive`. Selects the **gap-resolver** (see below).
+
+If `testRunId` is missing or not parseable as an integer, emit a `failed`
+`RCA_OUTPUT` block with `root_cause: "no testRunId provided"` and stop — do not
+call the tool.
+
+## Operating principles
+
+1. **Logs by TFA — the core contract.** Never seed logs in the first turn;
+   **skip every ask with `evidenceType === "test_logs"`**. Never fetch, paste, or
+   digest log content. Logs are TFA's job.
+2. **Read-only.** Every gather mechanism is read-only. Never write to a repo,
+   cluster, ticket, or the run. Produce a block and stop.
+3. **Turn-cap** = `turnCap` from `config/rca.config.json` (default 6). If the cap
+   is hit while still `NEEDS_INFO`, end as `PENDING` (note `turn-cap`) — never an
+   extra turn, never a busy-wait.
+4. **One thread per test.** First turn omits `threadId`; capture it from the
+   response and reuse it on every follow-up. Never start a second thread.
+5. **Soft-PENDING ends the loop.** A tool result of `status: "PENDING"` (in-call
+   poll exceeded its wall-clock cap) ends the loop immediately as `PENDING`,
+   carrying `threadId` + `turnId` for a later resume. Do not re-poll or sleep.
+6. **Digest, don't dump.** Every follow-up `message` carries digested findings
+   (`ask → found → snippet/link`), never raw log tails, full diffs, or full files.
+   Size caps + block shape live in `references/evidence-routing.md` — read it
+   before fulfilling any ask. The tool caps `message` at 5000 chars.
+7. **Report gaps, don't drop them.** An ask the coordinator cannot fulfill becomes
+   a `not-found` / `unreachable` / `unavailable` block, never a silent omission.
+8. **Never editorialize.** Report findings (suspect PR, server-side error line),
+   not verdicts. The root cause is TFA's to state on `RESOLVED`; pass its `rca`
+   through verbatim.
+
+## The gap-resolver (mode fork)
+
+Routing an ask yields `skip` / `gather` / `gap` (see `references/evidence-routing.md`).
+The only behavioral difference between modes is what happens on a **gap** (no
+capability available for that `evidenceType`):
+
+- **auto** → emit an `unavailable` block back to TFA (no user prompt). TFA
+  finalizes best-effort with lower confidence.
+- **interactive** → **return the gap to the caller** (the main agent), which asks
+  the user (A1) for that data, then feeds the answer back. A subagent cannot
+  prompt the user itself.
+
+Everything else — the loop, routing, digest, caps, output — is identical across
+modes. Do not fork the loop; only the gap action differs.
+
+## The loop
+
+```
+0. Parse inputs → testRunId (int). Build the first-turn DIGEST:
+     - pre_seed present → "Hypothesis from cluster representative: <cause>.
+        Suspect PR(s): <related_prs>. Confirm against THIS test's logs." (NO logs)
+     - error_digest present → "Error: <title + endpoint>" (NO logs, NO threadId)
+     - neither → "Initiating collaborative RCA for test run <id>."
+1. SUBMIT turn 1: tfaRcaTurn(testRunId=<id>, message=<digest>). Capture threadId. turns_used = 1.
+   (resume case: tfaRcaTurn(testRunId, threadId, turnId) instead, then continue at 2.)
+2. CLASSIFY result.status:
+     RESOLVED   → capture rca; END (RESOLVED).
+     BLOCKED    → capture reason + unmetAsks; END (BLOCKED).
+     PENDING    → capture threadId + turnId; END (PENDING, note "soft-pending").
+     NEEDS_INFO → go to 3.
+3. ROUTE the asks (read references/evidence-routing.md; route via lib/routing.mjs):
+     For each ask, high → medium → low:
+       skip   → record in asks_skipped, emit nothing.
+       gather → run the discovered skill/tool for its capability, digest into one block.
+                Record evidenceType in asks_fulfilled (dedupe).
+       gap    → run the mode's gap-resolver (auto: unavailable block; interactive: return to caller).
+     Concatenate per-ask blocks into the next-turn MESSAGE (respect size caps).
+4. SUBMIT follow-up on the SAME thread: tfaRcaTurn(testRunId, message, threadId). turns_used += 1.
+5. TURN-CAP CHECK: if turns_used >= turnCap and still NEEDS_INFO → END (PENDING, "turn-cap").
+     else → go to 2 with the new result.
+6. EMIT the RCA_OUTPUT block from the captured terminal state.
+```
+
+**Sibling confirm (cluster member).** When `pre_seed` is present the first turn
+states the representative's hypothesis and asks TFA to confirm against this
+test's own logs. If TFA `RESOLVED`s in one turn → a logs-grounded per-test RCA at
+minimal cost. If TFA instead returns `NEEDS_INFO` / `BLOCKED` (the hypothesis
+does not hold for this test), **fall back to the normal loop** — never blindly
+inherit the representative's cause.
+
+## Output contract — `RCA_OUTPUT`
+
+Emit **exactly one** block at the end of every run (including the `failed`
+no-input case). The orchestrator parses it into one CSV row / report record.
+
+```
+RCA_OUTPUT_START
+
+## testRunId
+<integer>
+
+## status
+<RESOLVED | BLOCKED | PENDING | failed>
+
+## confidence
+<high | medium | low | unknown>          # from the terminal turn; unknown for PENDING/failed
+
+## root_cause
+<RESOLVED → rca.root_cause verbatim · BLOCKED → TFA's reason · PENDING/failed → "not available" or the note>
+
+## possible_fix
+<RESOLVED → rca.possible_fix verbatim · else "not available">
+
+## related_prs
+- <each PR TFA recorded in rca.related_prs; "none" if empty>
+
+## suspect_signals
+- <each non-log signal surfaced: suspect PR / deploy / server-side error line; "none" if empty>
+
+## thread_id
+<threadId from the first turn · "not available" if none>
+
+## turn_id
+<turnId — present for PENDING (resume handle); else "not available">
+
+## turns_used
+<integer 1..turnCap; 0 on the no-input `failed` path, where the tool is never called>
+
+## asks_fulfilled
+- <evidenceType>            # every non-test_logs type fulfilled; "none" if empty
+
+## asks_skipped
+- test_logs                 # present once a test_logs ask appeared
+
+## asks_unavailable
+- <evidenceType>            # gaps with no capability (drives the coverage stamp, U10); "none" if empty
+
+RCA_OUTPUT_END
+```
+
+Notes:
+- `status` is one of exactly four values. `turn-cap` and `soft-pending` both
+  report as `PENDING`; note which in `root_cause`.
+- `asks_skipped` always includes `test_logs` whenever TFA asked for logs.
+  `asks_fulfilled` **never** includes `test_logs`.
+- `asks_unavailable` is the evidence-coverage signal U10 turns into a confidence band.
+- `failed` is the no-parseable-result / no-input case; the orchestrator
+  synthesizes a `failed` row if this coordinator dies — keep the block valid.
+
+## Hard limits
+
+- **Never** fulfill or seed a `test_logs` ask — TFA owns logs.
+- **Never** exceed `turnCap` `tfaRcaTurn` calls in one run.
+- **Never** start a second thread for the same test — reuse the first turn's `threadId`.
+- **Never** busy-wait / re-poll on a soft-`PENDING` — end and report it resumable.
+- **Never** dump raw logs, full diffs, or full file contents into a turn message — digest only.
+- **Never** write to any repo / cluster / ticket / the run — every action is read-only.
+- **Never** editorialize a cause — pass TFA's `rca` through verbatim.
+- **Never** blindly inherit a representative's cause for a sibling — confirm against its own logs.
+- **Always** emit exactly one valid `RCA_OUTPUT` block, even on the `failed` path.
diff --git a/lib/routing.mjs b/lib/routing.mjs
new file mode 100644
index 0000000..291738e
--- /dev/null
+++ b/lib/routing.mjs
@@ -0,0 +1,75 @@
+// Evidence-routing registry (D3). Maps a TFA `ask.evidenceType` onto an
+// action, given the run's capability manifest. Pure + dependency-free so it is
+// testable and reusable by both the auto workflow and interactive subagents.
+//
+// `test_logs` is the TFA agent's own evidence and is always skipped. Every
+// other type routes to a capability; whether that capability is *available* is
+// decided by the manifest (built once per run — see U6 / buildManifest).
+
+import { readFileSync } from "node:fs";
+
+export const TEST_LOGS = "test_logs";
+
+const PRIORITY_RANK = { high: 0, medium: 1, low: 2 };
+
+// Read configPath (absolute or cwd-relative) synchronously and return the parsed JSON; read/parse errors propagate.
+export function loadConfig(configPath) {
+  return JSON.parse(readFileSync(configPath, "utf8"));
+}
+
+// Return a sorted copy of a turn's asks, high → medium → low (unknown/missing priority last); the input is not mutated.
+export function orderAsks(asks = []) {
+  return [...asks].sort(
+    (a, b) =>
+      (PRIORITY_RANK[a?.priority] ?? 99) - (PRIORITY_RANK[b?.priority] ?? 99),
+  );
+}
+
+// Classify one ask against the registry (`config.evidenceRouting`) and the manifest. Returns one of:
+//   { action: "skip", reason: "tfa-owned", ... }      — entry has `skip` or owner "tfa"; the coordinator emits nothing
+//   { action: "gather", capability, via, ... }        — manifest[capability].available is truthy (`via` is null if unset)
+//   { action: "gap", capability, discoveryHints, ... } — no capability; the caller's resolveGap() decides
+//                                (auto → "unavailable" block; interactive → ask the user)
+// A missing evidenceType is treated as "other"; an unregistered type falls back to the `other` entry.
+// `manifest` shape: { [capability]: { available: boolean, via?: string } }.
+export function routeAsk(ask, config, manifest = {}) {
+  const evidenceType = ask?.evidenceType ?? "other";
+  const routing = config?.evidenceRouting ?? {};
+  const entry = routing[evidenceType] ?? routing.other ?? { capability: "other" };
+
+  if (entry.skip || entry.owner === "tfa") {
+    return { evidenceType, action: "skip", reason: "tfa-owned" };
+  }
+
+  const capability = entry.capability ?? "other";
+  const cap = manifest[capability];
+  if (cap && cap.available) {
+    return {
+      evidenceType,
+      action: "gather",
+      capability,
+      via: cap.via ?? null,
+    };
+  }
+
+  return {
+    evidenceType,
+    action: "gap",
+    capability,
+    discoveryHints: entry.discoveryHints ?? [],
+    reason: "no-capability",
+  };
+}
+
+// Split a turn's asks into { skip, gather, gap } buckets, in priority order; each item is
+// `{ ask, ...routed }` (the original ask plus its routeAsk() result). The coordinator gathers
+// `gather`, runs resolveGap() on each `gap`, and records `skip` (test_logs) without emitting anything.
+export function routeAsks(asks, config, manifest = {}) {
+  const ordered = orderAsks(asks);
+  const buckets = { skip: [], gather: [], gap: [] };
+  for (const ask of ordered) {
+    const routed = routeAsk(ask, config, manifest);
+    buckets[routed.action].push({ ask, ...routed });
+  }
+  return buckets;
+}
diff --git a/package.json b/package.json
index c11e40f..27344a7 100644
--- a/package.json
+++ b/package.json
@@ -5,6 +5,6 @@
   "type": "module",
   "description": "Generic multi-client RCA agent plugin harness",
   "scripts": {
-    "test": "node --test tests/"
+    "test": "node --test"
   }
 }
diff --git a/skills/rca-build/references/evidence-routing.md b/skills/rca-build/references/evidence-routing.md
new file mode 100644
index 0000000..e6cc4d0
--- /dev/null
+++ b/skills/rca-build/references/evidence-routing.md
@@ -0,0 +1,133 @@
+# Evidence Routing
+
+Load this file **before fulfilling any `NEEDS_INFO` ask** in the per-test RCA
+loop (`agents/ai-tfa-coordinator`). It maps each TFA `evidenceType` to a
+**capability** (not a hardcoded tool), and defines the **digest** the coordinator
+submits on the next turn.
+
+The core contract: **TFA owns logs; the client agent owns everything else.** The
+coordinator never seeds logs and never fulfills a `test_logs` ask. Every other
+`evidenceType` routes to a capability that is gathered via **whatever skill/tool
+the client actually has** for it (discovered once into the capability manifest —
+see `SKILL.md` § Pre-compute). There are **no `kubectl` / `chitragupta` /
+`bifrost` literals here** — that is the whole point of going generic.
+
+The registry logic lives in `lib/routing.mjs` (`routeAsk` / `routeAsks`); this
+file is the human/agent-facing contract for the digest and the size caps.
+
+---
+
+## How a turn's asks are processed
+
+A `NEEDS_INFO` turn returns `asks: TfaAsk[]`, each `{ what, why, evidenceType,
+priority }`. For each ask, in descending `priority` (`high` → `medium` → `low`):
+
+1. Route the `evidenceType` (via `lib/routing.mjs` → the config registry +
+   capability manifest). The result is one of three actions:
+   - **skip** — `test_logs` (TFA-owned). Gather nothing; record in `asks_skipped`.
+   - **gather** — a capability is available. Run its discovered skill/tool scoped
+     by `what` / `why`, then digest the result into one ask block.
+   - **gap** — no capability is available. Hand the ask to the injected
+     **`resolveGap()`** policy:
+     - **auto mode** → emit an `unavailable` block back to TFA (no user prompt).
+     - **interactive mode** → return the gap to the main agent, which asks the
+       user, then feeds the answer back.
+2. Concatenate the per-ask blocks into the next-turn `message` and resubmit on
+   the same `threadId`.
+
+An ask that cannot be fulfilled is **never silently dropped** — it becomes a
+`not-found` / `unreachable` / `unavailable` block so TFA can reason about the gap.
+
+---
+
+## Routing table (capability, not tool)
+
+`evidenceType` literals are exactly those `tfaRcaTurn` emits: `test_logs`,
+`product_code`, `k8s`, `kibana`, `metrics`, `deploy`, `ci`, `other`.
+
+| `evidenceType` | Capability | Gathered via (discovered at runtime) |
+|---|---|---|
+| `test_logs` | — (TFA, skip) | never gathered; TFA self-serves from its own log access |
+| `product_code` | `github` | the client's GitHub capability — **GitHub MCP if present, else `gh`** (see `references/github-evidence.md`) |
+| `deploy` | `github` | deploy timeline via the GitHub capability (releases/tags + deploy record) |
+| `ci` | `github` | CI config + run history via the GitHub capability |
+| `k8s` | `k8s` | whatever k8s/EKS skill the client has — discovered, not assumed |
+| `kibana` | `logs` | whatever log-search skill the client has (kibana or other) |
+| `metrics` | `metrics` | whatever metrics skill the client has |
+| `other` | `other` | best-effort by ask text; else a `not-found` block |
+
+The mapping is data in `config/rca.config.json` (`evidenceRouting`), so a
+different deployment can remap `evidenceType → capability` without code changes.
+
+**Deployment-state guard:** a suspect PR only matters if its code was actually
+live in the run's env at the failure window. If you can cheaply confirm it was
+not deployed / behind an OFF flag, say so in the digest rather than feeding TFA a
+suspect that could not have caused the failure. (Full protocol: U9 /
+`references/github-evidence.md`.)
+
+---
+
+## Digest format
+
+The single most important discipline: **digested input, not raw dumps.** Every
+turn's `message` loads into the agent's context *and* is sent to TFA; a raw log
+tail or full PR diff blows both budgets and degrades TFA's reasoning. Supply the
+*findings*, not the *haystack*.
+
+### Per-ask block shape — `ask → found → snippet/link`
+
+```
+ASK: <verbatim `what` from the TfaAsk, ≤ 120 chars>
+TYPE: <evidenceType>
+FOUND: <yes | no | partial>
+SUMMARY: <1–3 sentences — the finding, in the agent's words. ≤ 400 chars>
+SNIPPET:
+  <the load-bearing excerpt only — see size caps. Omit if a LINK fully carries it.>
+LINK: <permalink to the source — PR/commit/log-search/metrics panel/deploy record. Omit if N/A.>
+```
+
+- `SUMMARY` is the answer. `SNIPPET` is the *minimum* evidence backing it. `LINK`
+  lets TFA (or a human) verify without the bytes living in the message.
+- Prefer **LINK over SNIPPET** whenever a permalink fully carries the evidence.
+
+### Size caps (hard ceilings — truncate, never exceed)
+
+| Field / scope | Soft target | Hard ceiling | On exceed |
+|---|---|---|---|
+| `SUMMARY` | ≤ 300 chars | 400 chars | Tighten to the finding; drop restatement of the ask |
+| `SNIPPET` per ask | ≤ 20 lines | 40 lines | Keep the load-bearing lines; replace the rest with `… (N lines elided — see LINK)` |
+| Code diff in a `product_code` snippet | ≤ 1 hunk | 3 hunks | Show changed lines + 3 lines context; link the full PR |
+| Whole next-turn `message` | ≤ 200 lines | 400 lines (and ≤ `turnMessageMaxChars`) | Drop `low`-priority asks first; keep every `high` ask's block |
+| Asks fulfilled per turn | all `high` + `medium` | — | Defer `low` asks to a later turn rather than truncating a `high` ask |
+
+Truncation rule of thumb: **never truncate a `high`-priority ask's block to fit a
+`low`-priority one.** Drop the low block whole; keep the high block intact. The
+whole-message ceiling also honors `turnMessageMaxChars` from
+`config/rca.config.json` (the tool caps `message` at 5000 chars).
+
+### What never goes in a digest
+
+- Raw log tails, full log output, full file contents, full PR diffs — link or excerpt.
+- `test_logs` content of any kind (TFA owns it).
+- Credentials, tokens, internal hostnames, or any secret surfaced by an env/secret dump.
+- Speculation dressed as a finding. If `FOUND: no`, say what was checked; do not invent a cause.
+
+---
+
+## Unfulfillable asks — report, don't drop
+
+```
+ASK: <verbatim what>
+TYPE: <evidenceType>
+FOUND: no
+SUMMARY: not-found | unreachable | unavailable | out-of-scope — <one line: what was checked or why blocked>
+```
+
+- `not-found` — the skill/tool ran but the signal isn't there. State the search performed.
+- `unreachable` — the surface was not reachable from this agent context. State which.
+- `unavailable` — no capability/skill exists for this `evidenceType` (auto-mode gap result).
+- `out-of-scope` — the ask is not the agent's to fulfill. (`test_logs` asks never get a block; they are skipped and only recorded in `asks_skipped`.)
+
+An all-`unavailable` / all-`not-found` turn still resubmits — TFA decides whether
+the gap is fatal (→ BLOCKED) or it can converge anyway (best-effort, lower
+confidence). The coordinator does not pre-empt that decision.
diff --git a/tests/routing.test.mjs b/tests/routing.test.mjs
new file mode 100644
index 0000000..c63b595
--- /dev/null
+++ b/tests/routing.test.mjs
@@ -0,0 +1,80 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { routeAsk, routeAsks, orderAsks, TEST_LOGS } from "../lib/routing.mjs";
+
+const CONFIG = { // routing config shared by every test in this file
+  evidenceRouting: {
+    test_logs: { owner: "tfa", skip: true }, // TFA-owned evidence type; expected to be skipped
+    product_code: { capability: "github", discoveryHints: ["github-mcp", "gh"] },
+    k8s: { capability: "k8s", discoveryHints: [] },
+    other: { capability: "other", discoveryHints: [] }, // entry the unknown-evidenceType test expects as fallback
+  },
+};
+
+test("test_logs is always skipped (TFA-owned)", () => {
+  const r = routeAsk({ evidenceType: TEST_LOGS, priority: "high" }, CONFIG, {
+    github: { available: true },
+  });
+  assert.equal(r.action, "skip");
+  assert.equal(r.reason, "tfa-owned");
+});
+
+test("available capability → gather, carrying via", () => {
+  const r = routeAsk({ evidenceType: "product_code", priority: "high" }, CONFIG, {
+    github: { available: true, via: "github-mcp" },
+  });
+  assert.equal(r.action, "gather");
+  assert.equal(r.capability, "github");
+  assert.equal(r.via, "github-mcp");
+});
+
+test("unavailable capability → gap, carrying discovery hints", () => {
+  const r = routeAsk({ evidenceType: "k8s", priority: "medium" }, CONFIG, {
+    k8s: { available: false },
+  });
+  assert.equal(r.action, "gap");
+  assert.equal(r.capability, "k8s");
+  assert.equal(r.reason, "no-capability");
+});
+
+test("capability absent from manifest entirely → gap", () => {
+  const r = routeAsk({ evidenceType: "k8s", priority: "low" }, CONFIG, {});
+  assert.equal(r.action, "gap");
+});
+
+test("unknown evidenceType falls back to the 'other' entry", () => {
+  const r = routeAsk({ evidenceType: "weird", priority: "low" }, CONFIG, {
+    other: { available: true, via: "best-effort" },
+  });
+  assert.equal(r.action, "gather");
+  assert.equal(r.capability, "other");
+});
+
+test("orderAsks sorts high → medium → low, unknown last", () => {
+  const ordered = orderAsks([
+    { what: "c", priority: "low" },
+    { what: "a", priority: "high" },
+    { what: "d", priority: undefined },
+    { what: "b", priority: "medium" },
+  ]);
+  assert.deepEqual(
+    ordered.map((a) => a.what),
+    ["a", "b", "c", "d"],
+  );
+});
+
+test("routeAsks buckets a mixed turn in priority order", () => {
+  const buckets = routeAsks(
+    [
+      { evidenceType: "k8s", priority: "low" },
+      { evidenceType: "test_logs", priority: "high" },
+      { evidenceType: "product_code", priority: "high" },
+    ],
+    CONFIG,
+    { github: { available: true, via: "gh" } },
+  );
+  assert.equal(buckets.skip.length, 1);
+  assert.equal(buckets.gather.length, 1);
+  assert.equal(buckets.gap.length, 1);
+  assert.equal(buckets.gather[0].evidenceType, "product_code");
+});

From cb0d8f6d1408267f06619893c6202891c362b648 Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:01:27 +0530
Subject: [PATCH 03/12] feat(rca): pre-flight intake, discovery, CSV/WAL spine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SKILL.md orchestrator spec: mandatory GitHub intake ('I don't have one' → RCA-only;
headless missing-input fail-fast), discovery via listTestIds(failed,
includeFailureDetail), then cluster/pre-compute/fan-out/report steps.
lib/csv-state.mjs is the resumable WAL spine — seed (idempotent, terminal-
preserving), claim/heartbeat/flip, reaper, pendingRows — with timestamps injected
(workflow-sandbox-safe) and an RFC4180 codec for multiline RCA fields.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 lib/csv-state.mjs         | 239 ++++++++++++++++++++++++++++++++++++++
 skills/rca-build/SKILL.md | 124 ++++++++++++++++++++
 tests/csv-state.test.mjs  | 133 +++++++++++++++++++++
 3 files changed, 496 insertions(+)
 create mode 100644 lib/csv-state.mjs
 create mode 100644 skills/rca-build/SKILL.md
 create mode 100644 tests/csv-state.test.mjs

diff --git a/lib/csv-state.mjs b/lib/csv-state.mjs
new file mode 100644
index 0000000..499f997
--- /dev/null
+++ b/lib/csv-state.mjs
@@ -0,0 +1,239 @@
+// CSV write-ahead-log spine for the batch (D4 + ideation #7). The CSV is the
+// single durable, resumable source of truth for "RCA over ALL failed tests":
+// every test is a row, seeded `pending`, claimed by a worker, heartbeated while
+// in flight, and flipped to a terminal state with its RCA. A reaper reclaims
+// rows stranded by a crashed worker.
+//
+// Timestamps are passed in as `nowMs` (never read from the clock here) so this
+// module is deterministic in tests AND usable from the auto-mode dynamic
+// workflow, whose sandbox forbids Date.now().
+//
+// In-session / in-workspace only — cross-session durability is deferred. Writes
+// are synchronous read-modify-write; Node's single thread serializes them, which
+// is sufficient for the in-process 5-concurrent workflow (true multi-process
+// locking is out of scope).
+
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
+import { dirname } from "node:path";
+
+export const COLUMNS = [
+  "buildId",
+  "testRunId",
+  "testName",
+  "failure_category",
+  "error_summary",
+  "file_path",
+  "cluster_id",
+  "rca_done",
+  "in_flight_worker",
+  "heartbeat_ts",
+  "threadId",
+  "turnId",
+  "last_evidence_digest",
+  "root_cause",
+  "failure_type",
+  "possible_fix",
+  "related_prs",
+  "coverage",
+  "confidence",
+  "timestamp",
+];
+
+export const PENDING = "pending";
+const TERMINAL_STATES = new Set([
+  "resolved",
+  "blocked",
+  "failed",
+  "pending-resume",
+]);
+
+// ---- minimal RFC4180-ish CSV codec ----------------------------------------
+
+function encodeField(value) {
+  const s = value == null ? "" : String(value);
+  if (/[",\r\n]/.test(s)) {
+    return `"${s.replace(/"/g, '""')}"`;
+  }
+  return s;
+}
+
+function encodeRows(rows) {
+  const lines = [COLUMNS.join(",")];
+  for (const row of rows) {
+    lines.push(COLUMNS.map((c) => encodeField(row[c])).join(","));
+  }
+  return lines.join("\n") + "\n";
+}
+
+function parseCsv(text) { // Parse CSV text into an array of records (arrays of string cells).
+  const rows = [];
+  let field = ""; // cell currently being accumulated
+  let record = []; // cells of the record currently being accumulated
+  let inQuotes = false; // true while inside a double-quoted section
+  for (let i = 0; i < text.length; i++) {
+    const ch = text[i];
+    if (inQuotes) {
+      if (ch === '"') {
+        if (text[i + 1] === '"') { // doubled quote → one literal quote
+          field += '"';
+          i++;
+        } else {
+          inQuotes = false; // lone quote closes the quoted section
+        }
+      } else {
+        field += ch; // commas and line breaks are literal inside quotes
+      }
+    } else if (ch === '"') {
+      inQuotes = true;
+    } else if (ch === ",") {
+      record.push(field);
+      field = "";
+    } else if (ch === "\n" || ch === "\r") {
+      if (ch === "\r" && text[i + 1] === "\n") i++; // treat CRLF as a single terminator
+      record.push(field);
+      rows.push(record);
+      field = "";
+      record = [];
+    } else {
+      field += ch;
+    }
+  }
+  if (field.length > 0 || record.length > 0) { // flush a final record that has no trailing newline
+    record.push(field);
+    rows.push(record);
+  }
+  return rows;
+}
+
+// ---- read / write ----------------------------------------------------------
+
+export function readRows(csvPath) {
+  if (!existsSync(csvPath)) return [];
+  const text = readFileSync(csvPath, "utf8");
+  const raw = parseCsv(text).filter((r) => r.some((c) => c.length > 0));
+  if (raw.length === 0) return [];
+  const header = raw[0];
+  return raw.slice(1).map((cells) => {
+    const row = {};
+    header.forEach((col, idx) => {
+      row[col] = cells[idx] ?? "";
+    });
+    return row;
+  });
+}
+
+export function writeRows(csvPath, rows) {
+  const dir = dirname(csvPath);
+  if (dir && !existsSync(dir)) mkdirSync(dir, { recursive: true });
+  writeFileSync(csvPath, encodeRows(rows), "utf8");
+}
+
+function emptyRow() {
+  return Object.fromEntries(COLUMNS.map((c) => [c, ""]));
+}
+
+// ---- operations -------------------------------------------------------------
+
+// Seed the CSV from a listTestIds(failed, includeFailureDetail) payload. Every
+// row starts `pending`. Idempotent: existing rows are preserved (terminal rows
+// are never reset; signature columns are refreshed on still-pending rows). New
+// tests are appended. Returns the full row set.
+export function seed(csvPath, buildId, tests) {
+  const existing = readRows(csvPath);
+  const byId = new Map(existing.map((r) => [String(r.testRunId), r]));
+
+  for (const t of tests) {
+    const id = String(t.test_id ?? t.testRunId);
+    const sig = t.failure ?? {};
+    const prior = byId.get(id);
+    if (prior) {
+      // Keep terminal results; only refresh signature on still-pending rows.
+      if (prior.rca_done === PENDING) {
+        prior.failure_category = sig.category ?? prior.failure_category;
+        prior.error_summary = sig.error_summary ?? prior.error_summary;
+        prior.file_path = sig.file_path ?? prior.file_path;
+      }
+      continue;
+    }
+    const row = emptyRow();
+    row.buildId = buildId;
+    row.testRunId = id;
+    row.testName = t.test_name ?? t.testName ?? `Test ${id}`;
+    row.failure_category = sig.category ?? "";
+    row.error_summary = sig.error_summary ?? "";
+    row.file_path = sig.file_path ?? "";
+    row.rca_done = PENDING;
+    byId.set(id, row);
+    existing.push(row);
+  }
+
+  writeRows(csvPath, existing);
+  return existing;
+}
+
+// Claim a pending row for `worker`. Refuses (returns false) if another worker
+// already owns it. Returns true on success.
+export function claim(csvPath, testRunId, worker, nowMs) {
+  const rows = readRows(csvPath);
+  const row = rows.find((r) => String(r.testRunId) === String(testRunId));
+  if (!row) return false;
+  if (row.in_flight_worker && row.in_flight_worker !== worker) return false;
+  if (TERMINAL_STATES.has(row.rca_done)) return false;
+  row.in_flight_worker = worker;
+  row.heartbeat_ts = String(nowMs);
+  writeRows(csvPath, rows);
+  return true;
+}
+
+export function heartbeat(csvPath, testRunId, worker, nowMs) {
+  const rows = readRows(csvPath);
+  const row = rows.find((r) => String(r.testRunId) === String(testRunId));
+  if (!row || row.in_flight_worker !== worker) return false;
+  row.heartbeat_ts = String(nowMs);
+  writeRows(csvPath, rows);
+  return true;
+}
+
+// Flip a row to a terminal state, recording the RCA fields and clearing the
+// in-flight claim. `fields` carries any of: rca_done, root_cause, failure_type,
+// possible_fix, related_prs, threadId, turnId, coverage, confidence,
+// last_evidence_digest, cluster_id.
+export function flip(csvPath, testRunId, fields, nowMs) {
+  const rows = readRows(csvPath);
+  const row = rows.find((r) => String(r.testRunId) === String(testRunId));
+  if (!row) return false;
+  for (const [k, v] of Object.entries(fields)) {
+    if (COLUMNS.includes(k)) {
+      row[k] = Array.isArray(v) ? v.join("; ") : (v ?? "");
+    }
+  }
+  row.in_flight_worker = "";
+  row.timestamp = String(nowMs);
+  writeRows(csvPath, rows);
+  return true;
+}
+
+// Reclaim rows stranded in flight (heartbeat older than ttlSec) back to pending.
+// Returns the testRunIds reclaimed. Run on startup before resuming a batch.
+export function reaper(csvPath, ttlSec, nowMs) {
+  const rows = readRows(csvPath);
+  const reclaimed = [];
+  for (const row of rows) {
+    if (!row.in_flight_worker) continue;
+    if (TERMINAL_STATES.has(row.rca_done)) continue;
+    const hb = Number(row.heartbeat_ts);
+    const stale = !row.heartbeat_ts || nowMs - hb > ttlSec * 1000;
+    if (stale) {
+      row.in_flight_worker = "";
+      row.rca_done = PENDING;
+      reclaimed.push(String(row.testRunId));
+    }
+  }
+  if (reclaimed.length > 0) writeRows(csvPath, rows);
+  return reclaimed;
+}
+
+// Rows still needing work (pending or reclaimed). The work-list for fan-out.
+export function pendingRows(csvPath) {
+  return readRows(csvPath).filter((r) => r.rca_done === PENDING);
+}
diff --git a/skills/rca-build/SKILL.md b/skills/rca-build/SKILL.md
new file mode 100644
index 0000000..4bf6b34
--- /dev/null
+++ b/skills/rca-build/SKILL.md
@@ -0,0 +1,124 @@
+---
+name: rca-build
+description: Run collaborative root-cause analysis over ALL failed tests of a BrowserStack build. Generic across product and infra. Mandatory pre-flight GitHub intake, then discovery via listTestIds, failure-signature clustering, and per-test RCA via tfaRcaTurn (auto = dynamic workflow / interactive = subagents). Use when a build is red and you want a per-test RCA for every failure in the TRA dashboard.
+---
+
+# rca-build — batch collaborative RCA over a build
+
+Drives the `tfaRcaTurn` collaborative loop over **every failed test** of a build
+and records a per-test RCA. **TFA owns logs; the client agent owns everything
+else** (product code, k8s, kibana, metrics, deploy, ci) — routed by capability,
+generic over product and infra.
+
+This skill is the **build-level orchestrator** (`ai-tfa-orchestrator` role). It
+never calls `tfaRcaTurn` itself — it dispatches the `ai-tfa-coordinator`
+(test-level) per test/cluster member, which drives the loop and lets TFA author
+the dashboard RCA.
+
+Config (concurrency, turn-cap, paths, evidence registry) lives in
+`config/rca.config.json`. State lives in the CSV/WAL spine (`lib/csv-state.mjs`).
+
+## Step 0 — mode + input
+
+Parse from `/rca-build` args: the build id and optional `mode=auto|interactive`.
+
+- No build id present → it is required:
+  - interactive session → ask the user.
+  - **headless (`claude -p`) with build id missing → end immediately (fail fast).**
+- No mode given → ask the user once (auto vs interactive). In headless, default `auto`.
+
+## Step 1 — pre-flight intake (F1, mandatory, both modes)
+
+Ask the user (A1) for, in one pass:
+
+- product repo name, automation (test) repo name
+- working branch, default branch
+- the PRs in play (product + automation)
+- the build id (if not already supplied)
+
+Every question is **mandatory to ask** but answerable with **"I don't have one"**
+→ record the gap and proceed **RCA-only** (BrowserStack-side evidence + whatever
+infra skills exist). Do not block the run on missing GitHub context.
+
+**Headless rule:** in `claude -p`, any *required* input still missing after
+parsing (build id) ends the run immediately. Optional intake answers default to
+"none" without prompting.
+
+## Step 2 — discovery (F2)
+
+Call the bundled MCP tool:
+
+```
+listTestIds(buildId=<id>, status="failed", includeFailureDetail=true)
+```
+
+`includeFailureDetail=true` returns each row's trimmed failure signature
+(`failure.{category, error_summary, file_path, …}`) — the seed for clustering, so
+no per-test probe turns are needed.
+
+Seed the CSV/WAL spine from the payload (`lib/csv-state.mjs` → `seed`): one row
+per failed test, every row `rca_done=pending`, signature columns populated.
+Re-running `seed` on an existing CSV is idempotent and preserves terminal rows
+(resume-safe). If `listTestIds` returns empty → write an empty CSV, report "no
+failed tests", stop.
+
+## Step 3 — failure-signature clustering (see references/clustering.md)
+
+Compute a failure signature per row and assign `cluster_id` (`lib/signature.mjs`).
+Each cluster gets one **representative** (full multi-turn loop) and `N−1`
+**siblings** (pre-seeded one-turn confirm against their own logs). This collapses
+the expensive evidence hunt to O(distinct causes) while every test still lands a
+per-test RCA. Singleton clusters are just plain per-test loops.
+
+## Step 4 — build-evidence pre-compute + capability manifest (see references/evidence-routing.md)
+
+Once, before fan-out:
+
+- **Capability manifest** — enumerate the skills/tools the client actually has
+  into `capability → {available, via}` (GitHub, k8s, logs, metrics, …). Declare
+  to the user up front what will be **unavailable** ("k8s + metrics not
+  available"). Every coordinator routes asks against this manifest.
+- **Build-level evidence** — compute the last-green→this-build delta (diff,
+  deploy timeline, suspect-PR window) **once** and pre-seed every coordinator
+  with the same grounded window. Cache by `(repo, commit-range)`. No "last green"
+  baseline (never-green suite) → fall back to a configured baseline ref and log it.
+
+## Step 5 — fan-out (the mode fork)
+
+Drive the cluster work-list, **`concurrency` (default 5) at a time**:
+representatives deep, siblings one-turn-confirm. Eagerly persist to the CSV/WAL
+(claim → heartbeat → flip) so the run is resumable.
+
+- **auto** → run the dynamic workflow `workflows/rca-batch.mjs` (script-orchestrated,
+  no user input; gap → "unavailable" back to TFA → best-effort finalize).
+- **interactive** → spawn `ai-tfa-coordinator` subagents 5 at a time; on an
+  evidence gap a subagent returns the gap to this orchestrator, which asks the
+  user (A1), then feeds the answer back. Subagents return compact `RCA_OUTPUT`
+  blocks, not transcripts (keeps the main context lean for large batches).
+
+Both modes use the **same** `ai-tfa-coordinator`; only the injected gap-resolver
+differs. A coordinator that dies becomes a recorded `failed` row — one stuck test
+never sinks the batch (partial-first).
+
+## Step 6 — report (see references/report-format.md)
+
+When every row is terminal, render the report (`paths.reportFile`): per-test rows
+with status + the **evidence-coverage band** (a RESOLVED built with evidence
+unavailable reads as lower confidence than a fully-evidenced one). Degrade,
+don't crash — missing fields render as "not available".
+
+## Resume
+
+On startup, run the reaper (`lib/csv-state.mjs` → `reaper`) to reclaim rows
+stranded `in_flight` by a crashed worker (heartbeat older than
+`reaperHeartbeatTtlSec`) back to `pending`, then re-point fan-out at the CSV.
+Live `threadId`/`turnId` resume the prior thread; dead threads re-run from
+pending. (In-session only — cross-session durability is deferred.)
+
+## Hard rules
+
+- Always run the pre-flight intake; never silently skip it (but never block on "I don't have one").
+- Headless + missing required input → end immediately.
+- Never call `tfaRcaTurn` from this skill — always via the `ai-tfa-coordinator`.
+- Every failed test must end terminal in the CSV — partial-first, no abort-on-one-failure.
+- Never gather `test_logs` — TFA owns logs.
diff --git a/tests/csv-state.test.mjs b/tests/csv-state.test.mjs
new file mode 100644
index 0000000..5a9a60f
--- /dev/null
+++ b/tests/csv-state.test.mjs
@@ -0,0 +1,133 @@
+import { test, beforeEach, afterEach } from "node:test";
+import assert from "node:assert/strict";
+import { mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  seed,
+  readRows,
+  claim,
+  heartbeat,
+  flip,
+  reaper,
+  pendingRows,
+  PENDING,
+} from "../lib/csv-state.mjs";
+
+let dir; // per-test scratch directory
+let csv; // path of the state CSV inside `dir`
+
+beforeEach(() => {
+  dir = mkdtempSync(join(tmpdir(), "rca-csv-"));
+  csv = join(dir, "state.csv");
+});
+afterEach(() => rmSync(dir, { recursive: true, force: true })); // remove the scratch dir after each test
+
+const TESTS = [ // fixture payload passed to seed() by the tests below
+  {
+    test_id: 101,
+    test_name: "login",
+    failure: { category: "Assertion", error_summary: "expected 200", file_path: "a.rb" },
+  },
+  { test_id: 102, test_name: "checkout", failure: { category: "Timeout" } }, // partial signature: category only
+];
+
+test("seed writes one pending row per test with signature columns", () => {
+  const rows = seed(csv, "build-1", TESTS);
+  assert.equal(rows.length, 2);
+  assert.ok(rows.every((r) => r.rca_done === PENDING));
+  const login = rows.find((r) => r.testRunId === "101");
+  assert.equal(login.failure_category, "Assertion");
+  assert.equal(login.error_summary, "expected 200");
+  assert.equal(login.buildId, "build-1");
+});
+
+test("seed is idempotent — no duplicate rows on re-seed", () => {
+  seed(csv, "build-1", TESTS);
+  const rows = seed(csv, "build-1", TESTS);
+  assert.equal(rows.length, 2);
+});
+
+test("seed preserves a terminal row on re-seed", () => {
+  seed(csv, "build-1", TESTS);
+  flip(csv, 101, { rca_done: "resolved", root_cause: "bad PR" }, 1000);
+  seed(csv, "build-1", TESTS);
+  const login = readRows(csv).find((r) => r.testRunId === "101");
+  assert.equal(login.rca_done, "resolved");
+  assert.equal(login.root_cause, "bad PR");
+});
+
+test("claim sets the worker; a second worker is refused", () => {
+  seed(csv, "build-1", TESTS);
+  assert.equal(claim(csv, 101, "w1", 1000), true);
+  assert.equal(claim(csv, 101, "w2", 1000), false);
+  const row = readRows(csv).find((r) => r.testRunId === "101");
+  assert.equal(row.in_flight_worker, "w1");
+});
+
+test("heartbeat updates ts only for the owning worker", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000);
+  assert.equal(heartbeat(csv, 101, "w1", 2000), true);
+  assert.equal(heartbeat(csv, 101, "w2", 3000), false);
+  assert.equal(readRows(csv).find((r) => r.testRunId === "101").heartbeat_ts, "2000");
+});
+
+test("flip records terminal fields, joins related_prs, clears the claim", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000);
+  flip(
+    csv,
+    101,
+    { rca_done: "resolved", root_cause: "PR #7421", related_prs: ["#7421", "#7430"], confidence: "high" },
+    5000,
+  );
+  const row = readRows(csv).find((r) => r.testRunId === "101");
+  assert.equal(row.rca_done, "resolved");
+  assert.equal(row.related_prs, "#7421; #7430");
+  assert.equal(row.confidence, "high");
+  assert.equal(row.in_flight_worker, "");
+  assert.equal(row.timestamp, "5000");
+});
+
+test("reaper reclaims only stale in-flight rows", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000); // stale
+  claim(csv, 102, "w2", 9000); // fresh
+  const ttl = 600; // seconds
+  const now = 1000 + ttl * 1000 + 1; // just past TTL for w1, fresh for w2
+  const reclaimed = reaper(csv, ttl, now);
+  assert.deepEqual(reclaimed, ["101"]);
+  const rows = readRows(csv);
+  assert.equal(rows.find((r) => r.testRunId === "101").in_flight_worker, "");
+  assert.equal(rows.find((r) => r.testRunId === "101").rca_done, PENDING);
+  assert.equal(rows.find((r) => r.testRunId === "102").in_flight_worker, "w2");
+});
+
+test("reaper leaves terminal rows alone even if in_flight lingered", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000);
+  flip(csv, 101, { rca_done: "resolved" }, 2000); // flip clears in_flight
+  const reclaimed = reaper(csv, 600, 10_000_000);
+  assert.deepEqual(reclaimed, []);
+});
+
+test("pendingRows returns only pending work", () => {
+  seed(csv, "build-1", TESTS);
+  flip(csv, 101, { rca_done: "resolved" }, 1000);
+  const pend = pendingRows(csv);
+  assert.equal(pend.length, 1);
+  assert.equal(pend[0].testRunId, "102");
+});
+
+test("CSV codec round-trips fields with commas, quotes, newlines", () => {
+  seed(csv, "build-1", [{ test_id: 200, test_name: "weird" }]);
+  flip(
+    csv,
+    200,
+    { rca_done: "resolved", root_cause: 'Failed: "x", got <y>\nsecond line' },
+    1000,
+  );
+  const row = readRows(csv).find((r) => r.testRunId === "200");
+  assert.equal(row.root_cause, 'Failed: "x", got <y>\nsecond line');
+});

From bbee37db77095abfda7a1c83a7b42180ef12a901 Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:03:37 +0530
Subject: [PATCH 04/12] feat(rca): failure-signature clustering + sibling
 one-turn-confirm protocol

lib/signature.mjs computes signature = normalize(category|error|file) off the U1
discovery payload (folds timestamps/uuids/hex/line:col/numbers), groups rows by
signature, picks a deterministic representative (non-flaky, then smallest id),
and leaves signal-less rows as their own singletons. references/clustering.md
documents the O(causes) protocol: representative runs the full loop; siblings
pre-seed a one-turn confirm against their own logs with a fall-back-to-own-loop
safeguard (never blindly inherit).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 lib/signature.mjs                         | 78 ++++++++++++++++++++++
 skills/rca-build/references/clustering.md | 60 +++++++++++++++++
 tests/signature.test.mjs                  | 79 +++++++++++++++++++++++
 3 files changed, 217 insertions(+)
 create mode 100644 lib/signature.mjs
 create mode 100644 skills/rca-build/references/clustering.md
 create mode 100644 tests/signature.test.mjs

diff --git a/lib/signature.mjs b/lib/signature.mjs
new file mode 100644
index 0000000..42dc0ae
--- /dev/null
+++ b/lib/signature.mjs
@@ -0,0 +1,78 @@
+// Failure-signature clustering (ideation #1). A red build's N failures usually
+// trace to a handful of causes; clustering collapses the expensive evidence hunt
+// to O(distinct causes). The signature is computed from the trimmed failure
+// detail U1 surfaces on each listTestIds row (category + first error line + file
+// path) — no extra probe turns.
+//
+// Dependency-free + deterministic (no crypto, no clock, no random) so it is
+// usable from the auto-mode workflow sandbox and trivially testable.
+
+// Normalize a string for signature comparison: lowercase and fold the volatile
+// tokens that make two instances of the SAME failure look different (ids,
+// timestamps, hex/uuids, line:col, bare numbers).
+export function normalize(value) {
+  return String(value ?? "")
+    .toLowerCase()
+    .replace(/\b\d{4}-\d{2}-\d{2}[t ]\d{2}:\d{2}:\d{2}\S*/g, "<ts>") // ISO timestamps
+    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g, "<uuid>")
+    .replace(/0x[0-9a-f]+/g, "<hex>") // memory addresses
+    .replace(/:\d+(:\d+)?\b/g, ":<line>") // file:line(:col)
+    .replace(/\d+/g, "<n>") // remaining numbers (incl. unit-suffixed, e.g. 3000ms)
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+// The signature triple: normalized category | error summary | file path.
+export function computeSignature(row) {
+  const category = normalize(row.failure_category);
+  const error = normalize(row.error_summary);
+  const file = normalize(row.file_path);
+  const sig = `${category}|${error}|${file}`;
+  return sig.replace(/\|/g, "").trim().length === 0 ? "" : sig;
+}
+
+// Deterministic short id for a signature string (FNV-1a → base36).
+function hashId(s) {
+  let h = 0x811c9dc5;
+  for (let i = 0; i < s.length; i++) {
+    h ^= s.charCodeAt(i);
+    h = Math.imul(h, 0x01000193);
+  }
+  return (h >>> 0).toString(36);
+}
+
+// A stable representative for a cluster: prefer a non-flaky member (a flaky test
+// is a poor exemplar), then the smallest testRunId. Deterministic.
+export function selectRepresentative(members) {
+  return [...members].sort((a, b) => {
+    const aFlaky = a.is_flaky === "true" || a.is_flaky === true ? 1 : 0;
+    const bFlaky = b.is_flaky === "true" || b.is_flaky === true ? 1 : 0;
+    if (aFlaky !== bFlaky) return aFlaky - bFlaky;
+    return Number(a.testRunId) - Number(b.testRunId);
+  })[0];
+}
+
+// Cluster rows by signature. Mutates each row's `cluster_id`. Rows with no
+// signal (empty signature) become their own singleton (never merged into a
+// catch-all). Returns { rows, clusters } where each cluster carries its
+// representative + siblings.
+export function clusterRows(rows) {
+  const groups = new Map();
+
+  for (const row of rows) {
+    const sig = computeSignature(row);
+    const id = sig === "" ? `solo-${row.testRunId}` : `c-${hashId(sig)}`;
+    row.cluster_id = id;
+    if (!groups.has(id)) groups.set(id, { cluster_id: id, signature: sig, members: [] });
+    groups.get(id).members.push(row);
+  }
+
+  const clusters = [];
+  for (const group of groups.values()) {
+    const representative = selectRepresentative(group.members);
+    const siblings = group.members.filter((m) => m !== representative);
+    clusters.push({ ...group, representative, siblings });
+  }
+
+  return { rows, clusters };
+}
diff --git a/skills/rca-build/references/clustering.md b/skills/rca-build/references/clustering.md
new file mode 100644
index 0000000..66face8
--- /dev/null
+++ b/skills/rca-build/references/clustering.md
@@ -0,0 +1,60 @@
+# Failure-signature clustering
+
+Why: a red build's N failures usually trace to a handful of causes (one bad
+PR/deploy/shared helper). Running the full collaborative loop once per *cause*
+instead of once per *test* turns the dominant cost from **O(tests) → O(distinct
+causes)** — the only thing that makes "RCA for ALL failed tests, even thousands"
+feasible. But **every failed test must still show a per-test RCA in the TRA
+dashboard**, so clustering collapses the *evidence hunt*, not the *output*.
+
+The logic lives in `lib/signature.mjs`; this file is the protocol.
+
+## The signature
+
+Computed from the trimmed failure detail `listTestIds(includeFailureDetail=true)`
+already returns on each row — **no extra probe turns**:
+
+```
+signature = normalize(failure_category) | normalize(error_summary) | normalize(file_path)
+```
+
+`normalize` folds the volatile tokens that make two instances of the *same*
+failure look different: ISO timestamps, UUIDs, hex/memory addresses, `file:line:col`,
+and bare numbers. So `timeout after 3000ms on node-7` and `timeout after 5000ms
+on node-2` share a signature.
+
+A row with **no signal** (empty category, error, and path) is **not** merged into
+a catch-all — it becomes its own singleton (`solo-<testRunId>`). Better an
+un-clustered test than a wrong cluster.
+
+## Representative + siblings
+
+Each cluster gets:
+
+- **Representative** — a stable exemplar (non-flaky preferred, then smallest
+  `testRunId`). Runs the **full multi-turn `ai-tfa-coordinator` loop** →
+  confirmed root cause + culprit `related_prs`.
+- **Siblings** (`N−1`) — each runs its **own** coordinator, **pre-seeded** with
+  the representative's `root_cause` + suspect PRs. TFA confirms the hypothesis
+  **against that sibling's own logs in a single turn** → a logs-grounded per-test
+  RCA in the dashboard at minimal cost.
+
+Net cost per cluster: **1 deep investigation + (N−1) one-turn confirms.**
+
+## The safeguard — never blindly inherit
+
+Distinct failures can share an error string. A sibling's pre-seed turn is a
+*hypothesis to confirm*, not a verdict to copy:
+
+- TFA `RESOLVED`s the sibling in one turn → logs-grounded inheritance, cheap. 
+- TFA returns `NEEDS_INFO` / `BLOCKED` (the hypothesis does not hold for this
+  test's logs) → the sibling **falls back to its own full loop**. The
+  representative's cause is never stamped onto a sibling without log confirmation.
+
+This keeps correctness independent of the cost optimization: worst case, every
+sibling runs its own full loop (same as no clustering); best case, one deep run
+covers the whole cluster.
+
+## Singletons
+
+A cluster of one is just a plain per-test loop — no pre-seed, no confirm step.
diff --git a/tests/signature.test.mjs b/tests/signature.test.mjs
new file mode 100644
index 0000000..f721167
--- /dev/null
+++ b/tests/signature.test.mjs
@@ -0,0 +1,79 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import {
+  normalize,
+  computeSignature,
+  selectRepresentative,
+  clusterRows,
+} from "../lib/signature.mjs";
+
+function row(id, extra = {}) {
+  return {
+    testRunId: String(id),
+    failure_category: "Assertion",
+    error_summary: "expected 200 but got 500",
+    file_path: "spec/login.rb",
+    is_flaky: "false",
+    ...extra,
+  };
+}
+
+test("normalize folds timestamps, uuids, hex, line:col, and numbers", () => {
+  assert.equal(normalize("Error at line :42:7"), "error at line :<line>");
+  assert.equal(normalize("got 500 at 0xAF3"), "got <n> at <hex>");
+  assert.equal(
+    normalize("failed 2026-06-23T10:00:00Z"),
+    "failed <ts>",
+  );
+});
+
+test("identical category+error+path → same cluster", () => {
+  const { clusters } = clusterRows([row(1), row(2)]);
+  assert.equal(clusters.length, 1);
+  assert.equal(clusters[0].members.length, 2);
+});
+
+test("numbers in the error are folded so siblings still cluster", () => {
+  const a = row(1, { error_summary: "timeout after 3000ms on node-7" });
+  const b = row(2, { error_summary: "timeout after 5000ms on node-2" });
+  assert.equal(computeSignature(a), computeSignature(b));
+  const { clusters } = clusterRows([a, b]);
+  assert.equal(clusters.length, 1);
+});
+
+test("distinct failures → distinct clusters", () => {
+  const a = row(1, { error_summary: "null pointer in Foo" });
+  const b = row(2, { error_summary: "connection refused" });
+  const { clusters } = clusterRows([a, b]);
+  assert.equal(clusters.length, 2);
+});
+
+test("rows with no signal become their own singletons (no catch-all merge)", () => {
+  const a = { testRunId: "1", failure_category: "", error_summary: "", file_path: "" };
+  const b = { testRunId: "2", failure_category: "", error_summary: "", file_path: "" };
+  const { clusters } = clusterRows([a, b]);
+  assert.equal(clusters.length, 2);
+  assert.ok(clusters.every((c) => c.cluster_id.startsWith("solo-")));
+});
+
+test("singleton cluster has a representative and no siblings", () => {
+  const { clusters } = clusterRows([row(1)]);
+  assert.equal(clusters[0].siblings.length, 0);
+  assert.equal(clusters[0].representative.testRunId, "1");
+});
+
+test("representative is deterministic: non-flaky, then smallest testRunId", () => {
+  const members = [
+    row(5, { is_flaky: "true" }),
+    row(9, { is_flaky: "false" }),
+    row(7, { is_flaky: "false" }),
+  ];
+  assert.equal(selectRepresentative(members).testRunId, "7");
+});
+
+test("clusterRows stamps cluster_id onto every row", () => {
+  const rows = [row(1), row(2, { error_summary: "different" })];
+  clusterRows(rows);
+  assert.ok(rows.every((r) => r.cluster_id));
+  assert.notEqual(rows[0].cluster_id, rows[1].cluster_id);
+});

From 7ddb2c222bd9b3b479a3bce82d3ed0c5e75de69c Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:05:29 +0530
Subject: [PATCH 05/12] feat(rca): build-evidence pre-compute + cache +
 capability manifest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

buildManifest enumerates the client's discovered capabilities once into
capability→{available,via}, declared to the user + TFA so no evidence is asked
for that the client provably can't get. lib/evidence-cache.mjs computes the
last-green→this-build delta once and caches by (repo,range,evidenceType) — fresh
per-run Map, no module globals (multi-tenant-safe) — with resolveBaseline for the
never-green fallback. Routes the same grounded window into every coordinator.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 lib/evidence-cache.mjs                        | 47 ++++++++++++
 lib/routing.mjs                               | 31 ++++++++
 .../rca-build/references/evidence-routing.md  | 29 +++++++
 tests/evidence.test.mjs                       | 76 +++++++++++++++++++
 4 files changed, 183 insertions(+)
 create mode 100644 lib/evidence-cache.mjs
 create mode 100644 tests/evidence.test.mjs

diff --git a/lib/evidence-cache.mjs b/lib/evidence-cache.mjs
new file mode 100644
index 0000000..b9c9523
--- /dev/null
+++ b/lib/evidence-cache.mjs
@@ -0,0 +1,47 @@
+// Build-level evidence cache (ideation #2). "Diff since last green", "deploy
+// timeline", "PRs in the suspect window" are properties of the BUILD, not the
+// test — yet a naive loop re-fetches them per test. Compute once, cache by
+// (repo, commit-range, evidenceType), and pre-seed every coordinator with the
+// same grounded suspect window. Collapses N×M redundant git/infra calls to ~M.
+//
+// The cache is created fresh per run (function-scoped Map — never a module-level
+// global), so it holds no cross-run/cross-user state: in-workspace, single
+// session, multi-tenant-safe by construction.
+
+export function makeEvidenceCache() {
+  const store = new Map(); // created per call, so no state is shared between cache instances
+  const keyOf = (repo, range, evidenceType) =>
+    JSON.stringify([repo, range, evidenceType].map((part) => String(part ?? ""))); // tuple-encoded: no delimiter to collide on
+
+  return {
+    has(repo, range, evidenceType) {
+      return store.has(keyOf(repo, range, evidenceType));
+    },
+    get(repo, range, evidenceType) {
+      return store.get(keyOf(repo, range, evidenceType));
+    },
+    set(repo, range, evidenceType, value) {
+      store.set(keyOf(repo, range, evidenceType), value);
+      return value; // returns the stored value
+    },
+    // Compute-once: run `fn` only on a cache miss; reuse on every later call.
+    async compute(repo, range, evidenceType, fn) {
+      const k = keyOf(repo, range, evidenceType);
+      if (store.has(k)) return store.get(k); // has(), not a truthiness test: cached falsy values still count as hits
+      const value = await fn(); // a rejection propagates and nothing is cached for this key
+      store.set(k, value);
+      return value;
+    },
+    size() {
+      return store.size;
+    },
+  };
+}
+
+// Resolve the baseline ref for the last-green→this-build delta. When there is no
+// "last green" (e.g. a never-green flaky suite) fall back to a configured ref and
+// flag it so the report can note the weaker grounding.
+export function resolveBaseline(lastGreenRef, fallbackRef) {
+  const isFallback = !lastGreenRef; // any falsy last-green ref means there is no green baseline
+  return { ref: isFallback ? (fallbackRef ?? null) : lastGreenRef, isFallback };
+}
diff --git a/lib/routing.mjs b/lib/routing.mjs
index 291738e..ec7c4e7 100644
--- a/lib/routing.mjs
+++ b/lib/routing.mjs
@@ -73,3 +73,34 @@ export function routeAsks(asks, config, manifest = {}) {
   }
   return buckets;
 }
+
+// ---- capability manifest (ideation #3) -------------------------------------
+
+// Build the capability manifest ONCE per run from the capabilities the client
+// agent actually discovered. `discovered` is a list of
+// { capability, via } the orchestrator collected by asking "what skills/tools
+// are available?". Every capability the routing registry references (except the
+// TFA-owned test_logs) appears in the manifest, marked available iff discovered.
+// Declaring this to TFA lets it avoid asking for evidence the client can't get.
+export function buildManifest(config, discovered = []) {
+  const byCap = new Map(discovered.map((d) => [d.capability, d])); // on duplicate capabilities the last discovery wins
+  const manifest = {};
+  for (const entry of Object.values(config?.evidenceRouting ?? {})) {
+    if (!entry || entry.skip || entry.owner === "tfa") continue; // skipped / TFA-owned types get no manifest entry
+    const cap = entry.capability;
+    if (!cap || Object.hasOwn(manifest, cap)) continue; // own keys only: `in` also matches inherited names like "constructor"
+    const found = byCap.get(cap);
+    manifest[cap] = found
+      ? { available: true, via: found.via ?? null }
+      : { available: false, via: null };
+  }
+  return manifest;
+}
+
+// Capabilities that will be unavailable this run — declared to the user up front
+// ("k8s + metrics not available") and to TFA so it plans asks around them.
+export function unavailableCapabilities(manifest) {
+  return Object.entries(manifest ?? {}) // a missing manifest yields [] instead of throwing
+    .filter(([, v]) => !v?.available) // a nullish entry counts as unavailable
+    .map(([cap]) => cap);
+}
diff --git a/skills/rca-build/references/evidence-routing.md b/skills/rca-build/references/evidence-routing.md
index e6cc4d0..87fb255 100644
--- a/skills/rca-build/references/evidence-routing.md
+++ b/skills/rca-build/references/evidence-routing.md
@@ -131,3 +131,32 @@ SUMMARY: not-found | unreachable | unavailable | out-of-scope — <one line: wha
 An all-`unavailable` / all-`not-found` turn still resubmits — TFA decides whether
 the gap is fatal (→ BLOCKED) or it can converge anyway (best-effort, lower
 confidence). The coordinator does not pre-empt that decision.
+
+---
+
+## Capability manifest (built once per run)
+
+Rather than re-discover "is there a kibana skill?" on every ask across every
+test, the orchestrator enumerates the client's available skills/tools **once** up
+front into a manifest (`lib/routing.mjs` → `buildManifest`):
+
+```
+{ github: {available: true, via: "github-mcp"}, k8s: {available: false}, ... }
+```
+
+- Every ask routes against this manifest — reproducible, no per-ask discovery.
+- The orchestrator **declares the unavailable capabilities to the user** up front
+  ("k8s + metrics will be unavailable") and includes them in the first turn so
+  TFA plans asks around what's obtainable.
+- Frozen at run start. A skill appearing mid-run is not picked up until the next run.
+
+## Build-level evidence cache (compute once)
+
+"Diff since last green", "deploy timeline", and "PRs in the suspect window" are
+properties of the **build**, not the test. The orchestrator computes the
+last-green→this-build delta **once** (`lib/evidence-cache.mjs`), caches it by
+`(repo, commit-range, evidenceType)`, and pre-seeds every coordinator with the
+same grounded suspect window — collapsing N×M redundant git/infra calls to ~M and
+front-loading the highest-signal evidence so many tests RESOLVE before any infra
+ask fires. No "last green" (never-green suite) → fall back to a configured
+baseline ref and note the weaker grounding in the report.
diff --git a/tests/evidence.test.mjs b/tests/evidence.test.mjs
new file mode 100644
index 0000000..e01b06a
--- /dev/null
+++ b/tests/evidence.test.mjs
@@ -0,0 +1,76 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { buildManifest, unavailableCapabilities } from "../lib/routing.mjs";
+import { makeEvidenceCache, resolveBaseline } from "../lib/evidence-cache.mjs";
+
+const CONFIG = {
+  evidenceRouting: {
+    test_logs: { owner: "tfa", skip: true },
+    product_code: { capability: "github" },
+    deploy: { capability: "github" },
+    k8s: { capability: "k8s" },
+    metrics: { capability: "metrics" },
+    other: { capability: "other" },
+  },
+};
+
+test("buildManifest marks discovered capabilities available with via", () => {
+  const manifest = buildManifest(CONFIG, [
+    { capability: "github", via: "github-mcp" },
+  ]);
+  assert.equal(manifest.github.available, true);
+  assert.equal(manifest.github.via, "github-mcp");
+  assert.equal(manifest.k8s.available, false);
+});
+
+test("buildManifest excludes the TFA-owned test_logs capability", () => {
+  const manifest = buildManifest(CONFIG, []);
+  assert.ok(!("undefined" in manifest));
+  assert.ok(!Object.keys(manifest).includes("test_logs"));
+});
+
+test("buildManifest dedupes capabilities shared by multiple evidence types", () => {
+  // product_code + deploy both map to github → one manifest entry; pin the exact key set
+  const manifest = buildManifest(CONFIG, [{ capability: "github" }]);
+  assert.deepEqual(Object.keys(manifest).sort(), ["github", "k8s", "metrics", "other"]);
+});
+
+test("unavailableCapabilities lists what the client can't get", () => {
+  const manifest = buildManifest(CONFIG, [{ capability: "github" }]);
+  const unavailable = unavailableCapabilities(manifest).sort();
+  assert.deepEqual(unavailable, ["k8s", "metrics", "other"]);
+});
+
+test("evidence cache computes once and reuses across calls", async () => {
+  const cache = makeEvidenceCache();
+  let calls = 0;
+  const fn = async () => {
+    calls++;
+    return { prs: ["#1"] };
+  };
+  const a = await cache.compute("repo", "abc..def", "deploy", fn);
+  const b = await cache.compute("repo", "abc..def", "deploy", fn);
+  assert.equal(calls, 1);
+  assert.deepEqual(a, b);
+  assert.equal(cache.size(), 1);
+});
+
+test("evidence cache key distinguishes commit ranges", async () => {
+  const cache = makeEvidenceCache();
+  let calls = 0;
+  const fn = async () => ++calls;
+  await cache.compute("repo", "r1", "deploy", fn);
+  await cache.compute("repo", "r2", "deploy", fn);
+  assert.equal(calls, 2);
+});
+
+test("resolveBaseline uses last-green when present, else flags fallback", () => {
+  assert.deepEqual(resolveBaseline("v1.2.3", "main"), {
+    ref: "v1.2.3",
+    isFallback: false,
+  });
+  assert.deepEqual(resolveBaseline(null, "main"), {
+    ref: "main",
+    isFallback: true,
+  });
+});

From d6f0452298a8a923105bf473388b7d5bfd1b50ce Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:07:26 +0530
Subject: [PATCH 06/12] feat(rca): auto-mode dynamic workflow (rca-batch)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

workflows/rca-batch.mjs orchestrates the batch in auto mode: a pipeline over
clusters dispatches ai-tfa-coordinator agents — representative full loop →
siblings one-turn-confirm, no barrier between stages — with a structured RCA
schema. Sandbox-correct: does no state I/O itself (orchestrator passes the
clustered work-list + manifest + pre-computed build evidence via args; each
coordinator agent persists its own CSV row eagerly). Gap → 'unavailable' back to
TFA, no user prompt.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 workflows/rca-batch.mjs | 130 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 workflows/rca-batch.mjs

diff --git a/workflows/rca-batch.mjs b/workflows/rca-batch.mjs
new file mode 100644
index 0000000..e577dbe
--- /dev/null
+++ b/workflows/rca-batch.mjs
@@ -0,0 +1,130 @@
+export const meta = {
+  name: "rca-batch",
+  description:
+    "Drive collaborative RCA over all failed tests of a build (auto mode): cluster representatives run the full loop, siblings one-turn-confirm, ~5 concurrent.",
+  phases: [
+    { title: "Representatives", detail: "full multi-turn RCA per cluster" },
+    { title: "Siblings", detail: "one-turn confirm against own logs" },
+  ],
+};
+
+// AUTO MODE orchestration (D2). This is a dynamic-workflow script: it runs in the
+// Workflow sandbox (no filesystem, no Date.now/Math.random, agent()/pipeline()
+// as globals). It therefore does NO state I/O itself — the orchestrator seeds the
+// CSV, clusters, and builds the manifest in normal context and passes the
+// work-list via `args`; each dispatched `ai-tfa-coordinator` agent (which HAS
+// tool access) claims + flips its own CSV row eagerly (WAL); this script
+// orchestrates concurrency and returns the structured results for reconciliation.
+//
+// args shape:
+// {
+//   csvPath, buildId, mode: "auto",
+//   manifest: { capability: { available, via } },
+//   buildEvidence: { baselineRef, suspectWindow, ... },   // pre-computed once
+//   clusters: [
+//     { cluster_id, representative: { testRunId, testName, error_summary },
+//       siblings: [ { testRunId, testName, error_summary } ] }
+//   ]
+// }
+
+const RCA_SCHEMA = {
+  type: "object",
+  required: ["testRunId", "status"],
+  properties: {
+    testRunId: { type: "string" },
+    status: { enum: ["RESOLVED", "BLOCKED", "PENDING", "failed"] },
+    confidence: { enum: ["high", "medium", "low", "unknown"] },
+    root_cause: { type: "string" },
+    possible_fix: { type: "string" },
+    related_prs: { type: "array", items: { type: "string" } },
+    suspect_signals: { type: "array", items: { type: "string" } },
+    threadId: { type: "string" },
+    turnId: { type: "string" },
+    turns_used: { type: "number" },
+    asks_fulfilled: { type: "array", items: { type: "string" } },
+    asks_skipped: { type: "array", items: { type: "string" } },
+    asks_unavailable: { type: "array", items: { type: "string" } },
+    cluster_id: { type: "string" },
+  },
+  additionalProperties: true,
+};
+
+const ctx = args ?? {};
+const clusters = ctx.clusters ?? [];
+const shared = [
+  `CSV state file: ${ctx.csvPath}`,
+  `Capability manifest: ${JSON.stringify(ctx.manifest ?? {})}`,
+  `Build-level evidence (pre-computed once, reuse — do not re-fetch): ${JSON.stringify(ctx.buildEvidence ?? {})}`,
+  `Mode: auto — on an evidence gap with no capability, report "unavailable" back to TFA (NEVER prompt a user). Best-effort finalize.`,
+  `Persist eagerly to the CSV: claim your row before turn 1, flip it on terminal (lib/csv-state.mjs).`,
+].join("\n");
+
+function repPrompt(cluster) {
+  // Prompt for a cluster's representative: run the full loop, then append the
+  // run-wide context (`shared`). A nullish name/digest falls back to a placeholder.
+  const { testRunId, testName, error_summary: digest } = cluster.representative;
+  const header = `You are the ai-tfa-coordinator for cluster ${cluster.cluster_id}.`;
+  const task = `Run the FULL collaborative RCA loop for the representative test.`;
+  const identity = `testRunId=${testRunId}  testName=${testName ?? ""}`;
+  const evidence = `error_digest: ${digest ?? "(none)"}`;
+  const footer = `Return the structured RCA_OUTPUT for this test.`;
+  return [header, task, identity, evidence, shared, footer].join("\n");
+}
+
+function siblingPrompt(sibling, repResult, cluster) {
+  return [ // repResult may be missing or non-RESOLVED, so its status is stated rather than assumed
+    `You are the ai-tfa-coordinator for a SIBLING of cluster ${cluster.cluster_id}.`,
+    `Pre-seed: the representative finished with status=${repResult?.status ?? "(no result)"}:`,
+    `  root_cause: ${repResult?.root_cause ?? "(representative did not resolve)"}`,
+    `  related_prs: ${JSON.stringify(repResult?.related_prs ?? [])}`,
+    `Only if that status is RESOLVED: state this hypothesis on turn 1 and ask TFA to CONFIRM it against THIS test's own logs. Otherwise there is nothing to confirm — run the full loop from turn 1.`,
+    `If TFA confirms in one turn → done. If it does NOT (NEEDS_INFO/BLOCKED), fall back to the full loop — never blindly inherit.`,
+    `testRunId=${sibling.testRunId}  testName=${sibling.testName ?? ""}`,
+    `error_digest: ${sibling.error_summary ?? "(none)"}`,
+    shared,
+    `Return the structured RCA_OUTPUT for this test.`,
+  ].join("\n");
+}
+
+log(`Auto-mode batch: ${clusters.length} cluster(s) over build ${ctx.buildId ?? "?"}`);
+
+// Pipeline: each cluster flows representative → siblings independently (no barrier
+// between stages), so a small cluster's siblings confirm while a big cluster's
+// representative is still looping. Concurrency is bounded by the workflow runtime
+// (~min(16, cores-2)); config.concurrency (5) is the intended soft target.
+const results = await pipeline(
+  clusters,
+  (cluster) =>
+    agent(repPrompt(cluster), {
+      label: `rep:${cluster.representative.testRunId}`,
+      phase: "Representatives",
+      agentType: "ai-tfa-coordinator",
+      schema: RCA_SCHEMA,
+    }).then((rca) => ({ cluster, rca })),
+  ({ cluster, rca }) =>
+    parallel(
+      (cluster.siblings ?? []).map((sib) => () =>
+        agent(siblingPrompt(sib, rca, cluster), {
+          label: `sib:${sib.testRunId}`,
+          phase: "Siblings",
+          agentType: "ai-tfa-coordinator",
+          schema: RCA_SCHEMA,
+        }),
+      ),
+    ).then((sibs) => ({
+      cluster_id: cluster.cluster_id,
+      representative: rca,
+      siblings: sibs.filter(Boolean),
+    })),
+);
+
+const flat = results.filter(Boolean); // keep only truthy per-cluster results
+const all = flat.flatMap((r) => [r.representative, ...(r.siblings ?? [])]).filter(Boolean); // one entry per test RCA; falsy RCAs dropped
+const byStatus = all.reduce((acc, r) => { // tally: status → number of tests
+  acc[r.status] = (acc[r.status] ?? 0) + 1;
+  return acc;
+}, {});
+
+log(`Auto-mode batch complete: ${all.length} test(s) — ${JSON.stringify(byStatus)}`);
+
+return { clusters: flat.length, tests: all.length, byStatus, results: flat };

From 28ebc1d29c2d6501098ea446e105c5f01e97fd18 Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:08:45 +0530
Subject: [PATCH 07/12] =?UTF-8?q?feat(rca):=20interactive=20mode=20?=
 =?UTF-8?q?=E2=80=94=20subagents=20with=20user-in-the-loop=20gap-return?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

references/interactive-mode.md specifies the orchestrator loop: spawn
ai-tfa-coordinator subagents 5 at a time; a subagent cannot pause to prompt the
user, so on an evidence gap it ends early with a GAP_OUTPUT carrying resume
handles (threadId+turnId); the orchestrator asks A1, then re-dispatches with
resume= and the answer. Same coordinator as auto — only the gap action differs.
Compact blocks not transcripts (lean main context); partial-first; auto-first/
escalate-the-residue noted.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 agents/ai-tfa-coordinator.md                  | 31 ++++++++--
 skills/rca-build/SKILL.md                     |  7 ++-
 .../rca-build/references/interactive-mode.md  | 56 +++++++++++++++++++
 3 files changed, 86 insertions(+), 8 deletions(-)
 create mode 100644 skills/rca-build/references/interactive-mode.md

diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md
index e2045fb..4d42f17 100644
--- a/agents/ai-tfa-coordinator.md
+++ b/agents/ai-tfa-coordinator.md
@@ -71,12 +71,33 @@ capability available for that `evidenceType`):
 
 - **auto** → emit an `unavailable` block back to TFA (no user prompt). TFA
   finalizes best-effort with lower confidence.
-- **interactive** → **return the gap to the caller** (the main agent), which asks
-  the user (A1) for that data, then feeds the answer back. A subagent cannot
-  prompt the user itself.
+- **interactive** → a subagent cannot pause to prompt the user, so **end the run
+  early and return a `GAP_OUTPUT` block** (status `PENDING`) carrying the resume
+  handles + the gap. The orchestrator asks A1, then **re-dispatches a coordinator
+  with `resume={threadId, turnId}`** and the answer digested into the next turn.
+  See `references/interactive-mode.md`.
 
-Everything else — the loop, routing, digest, caps, output — is identical across
-modes. Do not fork the loop; only the gap action differs.
+`GAP_OUTPUT` block (interactive gap only):
+
+```
+GAP_OUTPUT_START
+## testRunId
+<integer>
+## thread_id
+<threadId>
+## turn_id
+<turnId>            # resume handle
+## gap
+- evidenceType: <type>
+- what: <verbatim ask `what`>
+- why: <verbatim ask `why`>
+GAP_OUTPUT_END
+```
+
+Everything else — the loop, routing, digest, caps, terminal output — is identical
+across modes. Do not fork the loop; only the gap action differs. When all gaps in
+a turn are resolvable (gathered or user-answered), the loop proceeds normally to a
+terminal `RCA_OUTPUT`.
 
 ## The loop
 
diff --git a/skills/rca-build/SKILL.md b/skills/rca-build/SKILL.md
index 4bf6b34..5248bee 100644
--- a/skills/rca-build/SKILL.md
+++ b/skills/rca-build/SKILL.md
@@ -92,9 +92,10 @@ representatives deep, siblings one-turn-confirm. Eagerly persist to the CSV/WAL
 - **auto** → run the dynamic workflow `workflows/rca-batch.mjs` (script-orchestrated,
   no user input; gap → "unavailable" back to TFA → best-effort finalize).
 - **interactive** → spawn `ai-tfa-coordinator` subagents 5 at a time; on an
-  evidence gap a subagent returns the gap to this orchestrator, which asks the
-  user (A1), then feeds the answer back. Subagents return compact `RCA_OUTPUT`
-  blocks, not transcripts (keeps the main context lean for large batches).
+  evidence gap a subagent ends early with a `GAP_OUTPUT` (resume handles), and
+  this orchestrator asks the user (A1) then re-dispatches with `resume=`. Subagents
+  return compact blocks, not transcripts (keeps the main context lean for large
+  batches). Full protocol: `references/interactive-mode.md`.
 
 Both modes use the **same** `ai-tfa-coordinator`; only the injected gap-resolver
 differs. A coordinator that dies becomes a recorded `failed` row — one stuck test
diff --git a/skills/rca-build/references/interactive-mode.md b/skills/rca-build/references/interactive-mode.md
new file mode 100644
index 0000000..493ea65
--- /dev/null
+++ b/skills/rca-build/references/interactive-mode.md
@@ -0,0 +1,56 @@
+# Interactive mode — subagents with a user in the loop
+
+Interactive mode (D2) puts the human (A1) in the loop **only at the orchestrator
+layer**. The main session spawns `ai-tfa-coordinator` subagents to investigate in
+parallel; when a subagent needs evidence it can't get, it hands the gap back up to
+the orchestrator, which asks the user and feeds the answer down.
+
+This is the **same coordinator** the auto workflow uses — only the gap-resolver
+differs (auto → "unavailable"; interactive → return the gap).
+
+## Why a subagent can't just "ask the user"
+
+A dispatched subagent runs to completion and returns one final message — it
+cannot pause mid-run, prompt the user, and resume. So the gap-return is modeled
+as **early termination with resume handles**, and the orchestrator drives the
+ask-and-resume loop.
+
+## The orchestrator loop (per batch of ≤ `concurrency`, default 5)
+
+```
+1. Take the next ≤5 pending work items (representatives first, then siblings).
+2. Dispatch one ai-tfa-coordinator subagent per item, mode=interactive, passing
+   the manifest + pre-computed build evidence + (for siblings) the pre-seed.
+3. Each subagent runs its loop until either:
+     - a terminal status → returns RCA_OUTPUT (the orchestrator flips the CSV row), or
+     - an interactive GAP → returns GAP_OUTPUT (status=PENDING) carrying:
+         { testRunId, threadId, turnId, gap: { evidenceType, what, why } }
+4. For each GAP_OUTPUT: ASK A1 for that evidence (one focused question).
+     - A1 answers → re-dispatch a coordinator with resume={threadId,turnId} and
+       the answer digested into the next turn's message. Continue its loop.
+     - A1 has nothing → tell the coordinator to report "unavailable" on resume
+       (degrade exactly like auto for that one ask).
+5. Repeat until every row is terminal. Then dispatch the next batch.
+```
+
+## Aggregation discipline (large batches)
+
+Subagents return **compact `RCA_OUTPUT` / `GAP_OUTPUT` blocks, never transcripts**
+— mirroring the auto workflow's "results in script vars" rule — so the main
+agent's context stays lean even over hundreds of tests. The orchestrator never
+holds full per-test loop transcripts; it holds one block per test.
+
+## Partial-first
+
+A subagent that dies becomes a recorded `failed` row (the orchestrator
+synthesizes it). One stuck test never sinks the batch — same contract as auto.
+
+## When to prefer interactive over auto
+
+- The client is missing infra skills the failures clearly need (k8s/kibana), and
+  the user can supply that evidence by hand.
+- The user wants to steer or sign off mid-run.
+
+Otherwise auto is cheaper (no human round-trips). Both write the same CSV rows
+and the same report, so a run can start auto and the residual BLOCKED/gap tests
+can be re-run interactively (the auto-first / escalate-the-residue pattern).

From e8b70e644225bc448fc47a0508b61223456cf07a Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:09:44 +0530
Subject: [PATCH 08/12] feat(rca): suspect-PR falsification packet +
 GitHub-evidence spec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

references/github-evidence.md specifies exactly what each github ask needs
(diff-since-baseline, PRs-in-window touching the failing path, blame, deploy
timing) and the discovery order GitHub MCP → gh → degrade — no shipped forensics
harness. Adds the adversarial falsification protocol (path overlap / deploy-state
guard / direction) so only verdict:supported suspects enter related_prs; ruled-out
suspects stay as disconfirming evidence. Coordinator runs it for product_code/
deploy/ci asks, reusing the pre-computed build evidence.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 agents/ai-tfa-coordinator.md                  | 12 +++
 .../rca-build/references/github-evidence.md   | 77 +++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 skills/rca-build/references/github-evidence.md

diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md
index 4d42f17..60b9753 100644
--- a/agents/ai-tfa-coordinator.md
+++ b/agents/ai-tfa-coordinator.md
@@ -99,6 +99,18 @@ across modes. Do not fork the loop; only the gap action differs. When all gaps i
 a turn are resolvable (gathered or user-answered), the loop proceeds normally to a
 terminal `RCA_OUTPUT`.
 
+## Suspect-PR falsification (github asks)
+
+For `product_code` / `deploy` / `ci` asks, follow `references/github-evidence.md`:
+gather the **exact** evidence (diff-since-baseline, PRs-in-window touching the
+failing path, blame, deploy timing) via **GitHub MCP → `gh` → degrade**, and for
+each candidate suspect **try to disprove it** (path overlap? shipped before the
+failure window? behind an OFF flag?). Feed both supporting *and* disconfirming
+evidence back as a structured suspect packet; only `verdict: supported` suspects
+belong in `related_prs`. Reuse the pre-computed build-level evidence — do not
+re-fetch per test. Never fabricate a PR when the github capability is unavailable
+— emit an `unavailable` block.
+
 ## The loop
 
 ```
diff --git a/skills/rca-build/references/github-evidence.md b/skills/rca-build/references/github-evidence.md
new file mode 100644
index 0000000..fa24aaa
--- /dev/null
+++ b/skills/rca-build/references/github-evidence.md
@@ -0,0 +1,77 @@
+# GitHub evidence — what to gather, and how to rule a suspect OUT
+
+The worst automated-RCA outcome is **confidently blaming an innocent PR**. This
+file is the contract for `product_code` / `deploy` / `ci` asks (the `github`
+capability): the **exact** evidence to gather, and a **falsification protocol**
+that tries to *disprove* each suspect before it enters `related_prs`.
+
+> We do **not** ship a GitHub forensics harness or MCP tool. We specify what's
+> needed and use whatever the client already has — **GitHub MCP if available,
+> else `gh`, else degrade** to an `unavailable` block.
+
+## Capability discovery (in order)
+
+1. **GitHub MCP** (`mcp__github__*`) — preferred for structured PR/diff/blame queries.
+2. **`gh` CLI** — fall back for git-graph operations (`gh pr list --search`,
+   `gh api`, `git merge-base`, ancestry) and anything the MCP doesn't cover.
+3. **Neither** → emit an `unavailable` block for the ask (do not fabricate a PR).
+
+The orchestrator records which is present in the capability manifest
+(`capability: github → { available, via }`); route every github ask against it.
+
+## Evidence each ask needs (be specific — no fishing)
+
+| Ask intent | Gather exactly |
+|---|---|
+| "Did `<X>` change since the last passing run?" | the diff of `<X>`'s file/function between the **baseline ref** (last-green, or the configured fallback) and the build's commit — not the whole repo diff |
+| "Which PRs are suspect?" | PRs **merged in the window** `(baselineRef, build commit]` that **touch the failing code path** — intersect changed files with the failing file/function |
+| "Who/what last changed the failing line?" | `blame` on the specific failing lines (from the test's `file_path` + the error) |
+| "What shipped to the run's env before the failure?" | deploy timeline (`gh` releases/tags + the env's deploy record); compare deploy time vs. the run's `started_at` |
+| "Did CI change?" | the workflow-file diff + recent `gh run` history for the failing job |
+
+Scope everything by the failing test's `file_path` + the error summary. The
+build-level evidence (diff-since-last-green, PR window) is **pre-computed once**
+and passed in — reuse it; do not re-fetch per test.
+
+## Falsification protocol — rule out, don't just rule in
+
+For **each** candidate suspect PR, try to **break** the hypothesis:
+
+1. **Path overlap.** Do the PR's changed hunks actually touch the failing code
+   path (the function/line in the stack)? No overlap → **ruled out**.
+2. **Deployment-state guard.** Was the PR's code actually **live** in the run's
+   env at `started_at`? If it shipped *after* the failure window, or sits behind
+   an **OFF** flag, it could not have caused this failure → **ruled out**.
+3. **Direction.** Does the change plausibly produce *this* error (e.g. a validator
+   tightened to reject the input the test sends)? If the change is unrelated to
+   the symptom → **ruled out** (reason `unrelated`).
+
+Feed **both supporting and disconfirming** evidence back to TFA. A suspect that
+survives 1–3 is a real candidate; one that fails any is reported as ruled-out
+(with the reason), **not** dropped silently.
+
+## The suspect packet (structured, not free text)
+
+Each surviving/ruled-out suspect is one structured block so `related_prs`
+populates deterministically:
+
+```
+SUSPECT:
+  pr: <#number>
+  files: <changed files overlapping the failing path>
+  hunks: <the 1-3 load-bearing changed hunks — see digest size caps>
+  author: <login>
+  merged_at: <ts>   vs   last_green: <ts>   vs   started_at: <ts>
+  verdict: supported | ruled-out (<reason: no-path-overlap | shipped-after | behind-off-flag | unrelated>)
+  link: <PR permalink>
+```
+
+Only `verdict: supported` suspects should end up in TFA's `related_prs`. Ruled-out
+suspects stay in the thread as disconfirming evidence so TFA (and a human) can see
+the elimination, not just the conclusion.
+
+## Digest discipline
+
+Same caps as `references/evidence-routing.md`: prefer a PR **link** over pasting a
+diff; aim for 1 hunk (hard cap: 3) per `product_code` snippet; never paste a full diff.
+The packet is *findings*, not the haystack.

From c5a4e9ecb219a3fd52ca7bdecc0365069f543b6d Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:11:34 +0530
Subject: [PATCH 09/12] feat(rca): coverage stamp + degrade-don't-crash report
 (resume reaper in U4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lib/coverage.mjs derives a per-row evidence-coverage band — TFA confidence capped
by coverage (full keeps it, partial→medium, thin→low) so a RESOLVED built with
evidence unavailable reads as lower confidence BECAUSE of the gap. lib/report.mjs
renders the CSV to markdown: status counts + per-test table + coverage caveats,
degrading missing fields to 'not available' and never crashing on an empty/partial
batch. report-format.md documents the stamp, layout, and the startup reaper resume
path. Blast-radius digest explicitly deferred.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 lib/coverage.mjs                             |  39 +++++++
 lib/report.mjs                               |  69 ++++++++++++
 skills/rca-build/references/report-format.md |  45 ++++++++
 tests/coverage-report.test.mjs               | 108 +++++++++++++++++++
 4 files changed, 261 insertions(+)
 create mode 100644 lib/coverage.mjs
 create mode 100644 lib/report.mjs
 create mode 100644 skills/rca-build/references/report-format.md
 create mode 100644 tests/coverage-report.test.mjs

diff --git a/lib/coverage.mjs b/lib/coverage.mjs
new file mode 100644
index 0000000..caff2d2
--- /dev/null
+++ b/lib/coverage.mjs
@@ -0,0 +1,39 @@
+// Evidence-coverage stamp (ideation #6, v1 — the per-row coverage band; the
+// build-level blast-radius digest is deferred). A RESOLVED RCA built with
+// k8s+kibana+metrics all "unavailable" must not read like one with full
+// evidence. The client (which routed every ask) stamps each row with a coverage
+// vector and derives a coverage-capped confidence band the reviewer sees:
+// "low confidence BECAUSE kibana was unavailable", not "low confidence, trust me".
+
+const BAND_ORDER = ["low", "medium", "high"];
+
+// coverage classification from what was fulfilled vs. left unavailable.
+export function classifyCoverage(asksFulfilled = [], asksUnavailable = []) {
+  const unavailable = [...new Set(asksUnavailable.filter(Boolean))];
+  const fulfilled = [...new Set(asksFulfilled.filter(Boolean))];
+  if (unavailable.length === 0) return "full";
+  if (fulfilled.length > 0) return "partial";
+  return "thin";
+}
+
+// Cap the band: full coverage keeps TFA's confidence; partial caps at medium;
+// thin caps at low. Unknown/absent TFA confidence floors to low.
+function capBand(tfaConfidence, coverage) {
+  const base = BAND_ORDER.includes(tfaConfidence) ? tfaConfidence : "low";
+  const cap = coverage === "full" ? "high" : coverage === "partial" ? "medium" : "low";
+  return BAND_ORDER[Math.min(BAND_ORDER.indexOf(base), BAND_ORDER.indexOf(cap))];
+}
+
+// The stamp written to a row at flip time. Returns { coverage, band, unavailable }.
+export function coverageStamp({
+  asksFulfilled = [],
+  asksUnavailable = [],
+  tfaConfidence = "unknown",
+} = {}) {
+  const coverage = classifyCoverage(asksFulfilled, asksUnavailable);
+  return {
+    coverage,
+    band: capBand(tfaConfidence, coverage),
+    unavailable: [...new Set(asksUnavailable.filter(Boolean))],
+  };
+}
diff --git a/lib/report.mjs b/lib/report.mjs
new file mode 100644
index 0000000..84ae275
--- /dev/null
+++ b/lib/report.mjs
@@ -0,0 +1,69 @@
+// Deterministic markdown report for a finished (or partial) batch. Degrade,
+// don't crash: any missing field renders as "not available"; an empty batch
+// still renders a valid report. Reads the CSV/WAL spine; no per-test transcripts.
+
+import { readRows } from "./csv-state.mjs";
+
+const NA = "not available";
+
+function cell(value) {
+  const s = value == null ? "" : String(value).trim();
+  if (s === "") return NA;
+  // keep the table one-line-per-row: collapse newlines, escape pipes
+  return s.replace(/\s*\n\s*/g, " ").replace(/\|/g, "\\|");
+}
+
+function countBy(rows, key) {
+  return rows.reduce((acc, r) => {
+    const k = r[key] || "unknown"; // blank/missing values are tallied under "unknown"
+    acc[k] = (acc[k] ?? 0) + 1;
+    return acc;
+  }, Object.create(null)); // null-prototype: "constructor"/"toString" must not resolve to inherited members
+}
+
+// Render from a rows array (testable) — or pass a csvPath via renderReportFromCsv.
+export function renderReport(rows, { buildId, generatedAt } = {}) {
+  const lines = [];
+  lines.push(`# RCA report${buildId ? ` — build ${buildId}` : ""}`);
+  if (generatedAt) lines.push(`\nGenerated: ${generatedAt}`);
+
+  if (!rows || rows.length === 0) {
+    lines.push("\nNo failed tests analyzed.");
+    return lines.join("\n") + "\n";
+  }
+
+  const byState = countBy(rows, "rca_done");
+  const summary = Object.entries(byState)
+    .map(([k, v]) => `${k}: ${v}`)
+    .join(" · ");
+  lines.push(`\n**${rows.length} test(s)** — ${summary}\n`);
+
+  lines.push(
+    "| testRunId | test | status | confidence | coverage | root cause | related PRs |",
+  );
+  lines.push("|---|---|---|---|---|---|---|");
+  for (const r of rows) {
+    lines.push(
+      `| ${cell(r.testRunId)} | ${cell(r.testName)} | ${cell(r.rca_done)} | ${cell(
+        r.confidence,
+      )} | ${cell(r.coverage)} | ${cell(r.root_cause)} | ${cell(r.related_prs)} |`,
+    );
+  }
+
+  // Surface coverage caveats so a "low confidence" reads as "because X unavailable".
+  const thin = rows.filter((r) => r.coverage === "thin" || r.coverage === "partial");
+  if (thin.length > 0) {
+    lines.push(`\n## Coverage caveats`);
+    for (const r of thin) {
+      lines.push(
+        `- ${cell(r.testRunId)} (${cell(r.coverage)} coverage): confidence band reflects evidence that was unavailable, not just model certainty.`,
+      );
+    }
+  }
+
+  return lines.join("\n") + "\n";
+}
+
+export function renderReportFromCsv(csvPath, opts = {}) {
+  return renderReport(readRows(csvPath), opts);
+}
diff --git a/skills/rca-build/references/report-format.md b/skills/rca-build/references/report-format.md
new file mode 100644
index 0000000..e27a28d
--- /dev/null
+++ b/skills/rca-build/references/report-format.md
@@ -0,0 +1,45 @@
+# Report format, coverage stamp, and resume
+
+## The CSV is the source of truth
+
+Every per-test result lives as one CSV row (`lib/csv-state.mjs`, columns in
+`COLUMNS`). The report is a deterministic render of that CSV — no per-test
+transcripts are kept. `rca_done` ∈ `pending | resolved | blocked | failed |
+pending-resume`.
+
+## Coverage stamp (ideation #6, v1)
+
+At flip time the orchestrator stamps each row (`lib/coverage.mjs`) from the
+coordinator's `asks_fulfilled` / `asks_unavailable` + TFA's confidence:
+
+- **coverage** — `full` (no gaps) · `partial` (some fulfilled, some unavailable) ·
+  `thin` (nothing fulfilled, only gaps).
+- **band** — TFA's confidence **capped by coverage**: `full` keeps it, `partial`
+  caps at `medium`, `thin` caps at `low`; unknown floors to `low`.
+
+So a RESOLVED with kibana/k8s unavailable reads as a lower band *because* evidence
+was missing — not the same as a fully-evidenced RESOLVED. The report's **Coverage
+caveats** section spells this out per affected row.
+
+> Out of v1 scope: the build-level **blast-radius digest** (rows inverted by
+> culprit PR, ranked) — deferred to follow-up. The per-row coverage stamp ships now.
+
+## Report layout (`lib/report.mjs` → `renderReport`)
+
+- Header + build id + generated-at.
+- One-line summary: total + counts by `rca_done`.
+- A per-test table: `testRunId | test | status | confidence | coverage | root cause | related PRs`.
+- A **Coverage caveats** list for `partial`/`thin` rows.
+
+**Degrade, don't crash:** any missing field renders as `not available`; an empty
+batch renders "No failed tests analyzed."; pipes are escaped and newlines
+collapsed so the table never breaks.
+
+## Resume (ideation #7)
+
+On startup the orchestrator runs the **reaper** (`lib/csv-state.mjs` → `reaper`):
+rows stuck `in_flight` with a heartbeat older than `reaperHeartbeatTtlSec` are
+reclaimed to `pending` (a crashed worker's rows), then fan-out re-points at the
+CSV. A row that retains a live `threadId`/`turnId` resumes that TFA thread; a dead
+thread re-runs from `pending`. In-session / in-workspace only — cross-session
+durability is deferred.
diff --git a/tests/coverage-report.test.mjs b/tests/coverage-report.test.mjs
new file mode 100644
index 0000000..7d7cb94
--- /dev/null
+++ b/tests/coverage-report.test.mjs
@@ -0,0 +1,108 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { coverageStamp, classifyCoverage } from "../lib/coverage.mjs";
+import { renderReport } from "../lib/report.mjs";
+
+// ---- coverage stamp --------------------------------------------------------
+
+test("full coverage keeps TFA confidence", () => {
+  const s = coverageStamp({
+    asksFulfilled: ["product_code"],
+    asksUnavailable: [],
+    tfaConfidence: "high",
+  });
+  assert.equal(s.coverage, "full");
+  assert.equal(s.band, "high");
+});
+
+test("partial coverage caps a high TFA confidence at medium", () => {
+  const s = coverageStamp({
+    asksFulfilled: ["product_code"],
+    asksUnavailable: ["kibana"],
+    tfaConfidence: "high",
+  });
+  assert.equal(s.coverage, "partial");
+  assert.equal(s.band, "medium");
+  assert.deepEqual(s.unavailable, ["kibana"]);
+});
+
+test("thin coverage (nothing fulfilled, gaps) caps at low", () => {
+  const s = coverageStamp({
+    asksFulfilled: [],
+    asksUnavailable: ["k8s", "metrics"],
+    tfaConfidence: "high",
+  });
+  assert.equal(s.coverage, "thin");
+  assert.equal(s.band, "low");
+});
+
+test("unknown TFA confidence floors to low even at full coverage", () => {
+  const s = coverageStamp({ asksFulfilled: [], asksUnavailable: [], tfaConfidence: "unknown" });
+  assert.equal(s.coverage, "full");
+  assert.equal(s.band, "low");
+});
+
+test("classifyCoverage dedupes and handles empties", () => {
+  assert.equal(classifyCoverage(["a", "a"], []), "full");
+  assert.equal(classifyCoverage([], ["x"]), "thin");
+});
+
+// ---- report ----------------------------------------------------------------
+
+test("empty batch renders a valid report, no crash", () => {
+  const md = renderReport([], { buildId: "b1" });
+  assert.match(md, /No failed tests analyzed/);
+});
+
+test("report renders a row table with status counts", () => {
+  const rows = [
+    {
+      testRunId: "101",
+      testName: "login",
+      rca_done: "resolved",
+      confidence: "high",
+      coverage: "full",
+      root_cause: "PR #7421 tightened validator",
+      related_prs: "#7421",
+    },
+    {
+      testRunId: "102",
+      testName: "checkout",
+      rca_done: "blocked",
+      confidence: "",
+      coverage: "",
+      root_cause: "",
+      related_prs: "",
+    },
+  ];
+  const md = renderReport(rows, { buildId: "b1" });
+  assert.match(md, /2 test\(s\)/);
+  assert.match(md, /resolved: 1/);
+  assert.match(md, /blocked: 1/);
+  assert.match(md, /101/);
+  assert.match(md, /not available/); // 102's blank fields degrade
+});
+
+test("report escapes pipes and collapses newlines in cells", () => {
+  const rows = [
+    {
+      testRunId: "1",
+      testName: "t",
+      rca_done: "resolved",
+      root_cause: "a | b\nsecond line",
+      related_prs: "#1",
+    },
+  ];
+  const md = renderReport(rows);
+  assert.ok(!md.includes("a | b\nsecond"));
+  assert.match(md, /a \\\| b second line/);
+});
+
+test("report surfaces coverage caveats for thin/partial rows", () => {
+  const rows = [
+    { testRunId: "1", testName: "t", rca_done: "resolved", coverage: "partial" },
+  ];
+  const md = renderReport(rows);
+  assert.match(md, /Coverage caveats/);
+  assert.match(md, /confidence band reflects evidence that was unavailable/);
+});

From e277811ecbdd58fc3bdbf18732615560a7cc26cb Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:14:25 +0530
Subject: [PATCH 10/12] feat(rca): conformance fixture + executable loop mirror
 / sequential harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lib/loop.mjs (runRcaLoop) is an executable mirror of the coordinator loop —
status branching, ask routing, gap resolution, turn-cap, one-thread, soft-PENDING
— driven by an injected submit(). It doubles as the D5 sequential thin-client
harness. tests/conformance.test.mjs replays recorded tfaRcaTurn transcripts
(resolved/blocked/pending/turn-cap fixtures) and proves: rca capture, test_logs
skip, soft-PENDING no-re-poll, turn-cap never submits a 7th turn, and the degraded
(no-capability auto) path still reaches a valid terminal RCA — same loop, same
result. 48 tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 agents/ai-tfa-coordinator.md                |   6 +
 lib/loop.mjs                                | 125 ++++++++++++++++++
 tests/conformance.test.mjs                  | 133 ++++++++++++++++++++
 tests/fixtures/recorded-turns/blocked.json  |  13 ++
 tests/fixtures/recorded-turns/pending.json  |  12 ++
 tests/fixtures/recorded-turns/resolved.json |  37 ++++++
 tests/fixtures/recorded-turns/turn-cap.json |  12 ++
 7 files changed, 338 insertions(+)
 create mode 100644 lib/loop.mjs
 create mode 100644 tests/conformance.test.mjs
 create mode 100644 tests/fixtures/recorded-turns/blocked.json
 create mode 100644 tests/fixtures/recorded-turns/pending.json
 create mode 100644 tests/fixtures/recorded-turns/resolved.json
 create mode 100644 tests/fixtures/recorded-turns/turn-cap.json

diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md
index 60b9753..8f5cb4f 100644
--- a/agents/ai-tfa-coordinator.md
+++ b/agents/ai-tfa-coordinator.md
@@ -139,6 +139,12 @@ re-fetch per test. Never fabricate a PR when the github capability is unavailabl
 6. EMIT the RCA_OUTPUT block from the captured terminal state.
 ```
 
+> The loop mechanics above have an **executable mirror** in `lib/loop.mjs`
+> (`runRcaLoop`) — conformance-tested against recorded `tfaRcaTurn` transcripts
+> (`tests/conformance.test.mjs`). It also serves as the **sequential thin-client
+> harness** (D5): MCP clients without workflows/subagents drive the same contract
+> by calling `runRcaLoop` with a real `submit` bound to `tfaRcaTurn`.
+
 **Sibling confirm (cluster member).** When `pre_seed` is present the first turn
 states the representative's hypothesis and asks TFA to confirm against this
 test's own logs. If TFA `RESOLVED`s in one turn → a logs-grounded per-test RCA at
diff --git a/lib/loop.mjs b/lib/loop.mjs
new file mode 100644
index 0000000..1d1728d
--- /dev/null
+++ b/lib/loop.mjs
@@ -0,0 +1,125 @@
+// Executable mirror of the ai-tfa-coordinator loop (agents/ai-tfa-coordinator.md).
+// It drives the collaborative loop against an injected `submit` (real = the
+// tfaRcaTurn MCP tool; tests = a recorded-turn replayer), so the loop mechanics —
+// status branching, ask routing, gap resolution, turn-cap, one-thread,
+// soft-PENDING — are tested rather than assumed.
+//
+// Double duty: this is ALSO the **sequential thin-client harness** (D5 / ideation
+// #4) — the third caller of the same contract, for MCP clients without
+// workflows/subagents. Pure + dependency-light (imports only the routing registry).
+
+import { routeAsks } from "./routing.mjs";
+
+function unavailableBlock(gap) {
+  const what = gap?.ask?.what ?? "";
+  return [
+    `ASK: ${what}`,
+    `TYPE: ${gap.evidenceType}`,
+    `FOUND: no`,
+    `SUMMARY: unavailable — no ${gap.capability} capability for this client.`,
+  ].join("\n");
+}
+
+// runRcaLoop drives one test to a terminal RCA_OUTPUT object.
+//
+//   submit({ testRunId, message, threadId, turnId }) → Promise<turn>   (tfaRcaTurn shape)
+//   gather(routedGatherEntry) → Promise<string>                        (one digest block)
+//   resolveGap(routedGapEntry) → Promise<{ digest } | null>            (auto: null; interactive: a digest)
+export async function runRcaLoop({
+  testRunId,
+  firstMessage = "",
+  submit,
+  config = {},
+  manifest = {},
+  gather = async () => "",
+  resolveGap = async () => null,
+  turnCap = config?.turnCap ?? 6,
+}) {
+  if (testRunId == null || String(testRunId).trim() === "" || Number.isNaN(Number(testRunId))) {
+    return {
+      testRunId: String(testRunId),
+      status: "failed", // also covers blank ids: Number("") is 0, not NaN, so they are rejected explicitly
+      root_cause: "no testRunId provided",
+      turns_used: 0,
+      asks_fulfilled: [],
+      asks_skipped: [],
+      asks_unavailable: [],
+    };
+  }
+
+  let threadId;
+  let turnId;
+  let turns = 0;
+  let message = firstMessage;
+  const fulfilled = new Set();
+  const skipped = new Set();
+  const unavailable = new Set();
+
+  const out = (status, turn, note) => {
+    const rca = turn?.rca ?? {};
+    return {
+      testRunId: String(testRunId),
+      status,
+      confidence: turn?.confidence ?? "unknown",
+      root_cause:
+        status === "RESOLVED"
+          ? (rca.root_cause ?? "")
+          : status === "BLOCKED"
+            ? (turn?.reason ?? "")
+            : (note ?? ""),
+      possible_fix: rca.possible_fix ?? "",
+      related_prs: rca.related_prs ?? [],
+      threadId: threadId ?? null,
+      turnId: turnId ?? null,
+      turns_used: turns,
+      asks_fulfilled: [...fulfilled],
+      asks_skipped: [...skipped],
+      asks_unavailable: [...unavailable],
+    };
+  };
+
+  while (true) {
+    turns++;
+    const turn = await submit({ testRunId, message, threadId, turnId });
+    threadId = turn.threadId ?? threadId;
+
+    if (turn.status === "RESOLVED") return out("RESOLVED", turn);
+    if (turn.status === "BLOCKED") return out("BLOCKED", turn);
+    if (turn.status === "PENDING") {
+      turnId = turn.turnId ?? turnId;
+      return out("PENDING", turn, "soft-pending");
+    }
+
+    // NEEDS_INFO: route + fulfill.
+    const buckets = routeAsks(turn.asks ?? [], config, manifest);
+    const blocks = [];
+    for (const s of buckets.skip) skipped.add(s.evidenceType);
+    for (const g of buckets.gather) {
+      blocks.push(await gather(g));
+      fulfilled.add(g.evidenceType);
+    }
+    for (const gap of buckets.gap) {
+      const resolved = await resolveGap(gap);
+      if (resolved && resolved.digest) {
+        blocks.push(resolved.digest);
+        fulfilled.add(gap.evidenceType);
+      } else {
+        unavailable.add(gap.evidenceType);
+        blocks.push(unavailableBlock(gap));
+      }
+    }
+
+    if (turns >= turnCap) return out("PENDING", turn, "turn-cap");
+    message = blocks.join("\n\n");
+  }
+}
+
+// Replay helper for tests: returns a submit() that yields recorded turns in order.
+export function replaySubmit(turns) {
+  let i = 0;
+  return async () => {
+    const turn = turns[Math.min(i, turns.length - 1)];
+    i++;
+    return turn;
+  };
+}
diff --git a/tests/conformance.test.mjs b/tests/conformance.test.mjs
new file mode 100644
index 0000000..5ea6192
--- /dev/null
+++ b/tests/conformance.test.mjs
@@ -0,0 +1,133 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { readFileSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+import { dirname, join } from "node:path";
+import { runRcaLoop, replaySubmit } from "../lib/loop.mjs";
+
+const here = dirname(fileURLToPath(import.meta.url));
+const load = (name) =>
+  JSON.parse(readFileSync(join(here, "fixtures", "recorded-turns", name), "utf8"));
+
+const CONFIG = {
+  turnCap: 6,
+  evidenceRouting: {
+    test_logs: { owner: "tfa", skip: true },
+    product_code: { capability: "github" },
+    other: { capability: "other" },
+  },
+};
+const GITHUB_AVAILABLE = { github: { available: true, via: "gh" } };
+
+// A coordinator gather() stub: returns a one-line digest block.
+const gather = async (g) => `ASK: ${g.ask.what}\nTYPE: ${g.evidenceType}\nFOUND: yes\nSUMMARY: stub`;
+
+test("resolved fixture: NEEDS_INFO → evidence → RESOLVED, rca captured, test_logs skipped", async () => {
+  const fx = load("resolved.json");
+  const result = await runRcaLoop({
+    testRunId: fx.testRunId,
+    firstMessage: "Error: empty buildName",
+    submit: replaySubmit(fx.turns),
+    config: CONFIG,
+    manifest: GITHUB_AVAILABLE,
+    gather,
+  });
+  assert.equal(result.status, "RESOLVED");
+  assert.match(result.root_cause, /#7421/);
+  assert.deepEqual(result.related_prs, ["#7421"]);
+  assert.deepEqual(result.asks_fulfilled, ["product_code"]);
+  assert.deepEqual(result.asks_skipped, ["test_logs"]); // TFA-owned, never gathered
+  assert.equal(result.turns_used, 2);
+  assert.equal(result.threadId, "thr-39");
+});
+
+test("blocked fixture: terminal with reason captured", async () => {
+  const fx = load("blocked.json");
+  const result = await runRcaLoop({
+    testRunId: fx.testRunId,
+    submit: replaySubmit(fx.turns),
+    config: CONFIG,
+  });
+  assert.equal(result.status, "BLOCKED");
+  assert.match(result.root_cause, /could not obtain server-side logs/);
+});
+
+test("pending fixture: soft-PENDING ends with turnId, no re-poll", async () => {
+  const fx = load("pending.json");
+  let calls = 0;
+  const counting = async (args) => {
+    calls++;
+    return replaySubmit(fx.turns)(args);
+  };
+  const result = await runRcaLoop({
+    testRunId: fx.testRunId,
+    submit: counting,
+    config: CONFIG,
+  });
+  assert.equal(result.status, "PENDING");
+  assert.equal(result.turnId, "turn-81-1");
+  assert.equal(calls, 1); // ended immediately, did not poll again
+});
+
+test("turn-cap fixture: ends PENDING(turn-cap) at the cap, never a 7th submit", async () => {
+  const fx = load("turn-cap.json");
+  let submits = 0;
+  const counting = async (args) => {
+    submits++;
+    return replaySubmit(fx.turns)(args);
+  };
+  const result = await runRcaLoop({
+    testRunId: fx.testRunId,
+    submit: counting,
+    config: CONFIG,
+    manifest: GITHUB_AVAILABLE,
+    gather,
+  });
+  assert.equal(result.status, "PENDING");
+  assert.equal(result.root_cause, "turn-cap");
+  assert.equal(submits, 6); // capped at turnCap, never 7
+});
+
+test("degraded path: no capability + auto resolveGap → asks_unavailable, still terminal", async () => {
+  // Same resolved fixture, but the client has NO github capability and runs auto
+  // (resolveGap returns null → 'unavailable'). The loop must still reach RESOLVED.
+  const fx = load("resolved.json");
+  const result = await runRcaLoop({
+    testRunId: fx.testRunId,
+    submit: replaySubmit(fx.turns),
+    config: CONFIG,
+    manifest: {}, // nothing available
+    resolveGap: async () => null, // auto: report unavailable
+  });
+  assert.equal(result.status, "RESOLVED");
+  assert.deepEqual(result.asks_unavailable, ["product_code"]);
+  assert.deepEqual(result.asks_fulfilled, []);
+});
+
+test("interactive resolveGap supplies the missing evidence → fulfilled, not unavailable", async () => {
+  const fx = load("resolved.json");
+  const result = await runRcaLoop({
+    testRunId: fx.testRunId,
+    submit: replaySubmit(fx.turns),
+    config: CONFIG,
+    manifest: {},
+    resolveGap: async () => ({ digest: "ASK: ...\nFOUND: yes\nSUMMARY: user supplied" }),
+  });
+  assert.equal(result.status, "RESOLVED");
+  assert.deepEqual(result.asks_fulfilled, ["product_code"]);
+  assert.deepEqual(result.asks_unavailable, []);
+});
+
+test("no testRunId → failed block, tool never called", async () => {
+  let called = false;
+  const result = await runRcaLoop({
+    testRunId: undefined,
+    submit: async () => {
+      called = true;
+      return {};
+    },
+    config: CONFIG,
+  });
+  assert.equal(result.status, "failed");
+  assert.equal(called, false);
+});
diff --git a/tests/fixtures/recorded-turns/blocked.json b/tests/fixtures/recorded-turns/blocked.json
new file mode 100644
index 0000000..35b5373
--- /dev/null
+++ b/tests/fixtures/recorded-turns/blocked.json
@@ -0,0 +1,13 @@
+{
+  "name": "blocked — unmet asks",
+  "testRunId": 72,
+  "turns": [
+    {
+      "status": "BLOCKED",
+      "confidence": "low",
+      "threadId": "thr-72",
+      "reason": "could not obtain server-side logs; cannot distinguish product bug from env flake",
+      "unmetAsks": ["kibana", "k8s"]
+    }
+  ]
+}
diff --git a/tests/fixtures/recorded-turns/pending.json b/tests/fixtures/recorded-turns/pending.json
new file mode 100644
index 0000000..ec8b16a
--- /dev/null
+++ b/tests/fixtures/recorded-turns/pending.json
@@ -0,0 +1,12 @@
+{
+  "name": "soft-pending — resumable",
+  "testRunId": 81,
+  "turns": [
+    {
+      "status": "PENDING",
+      "confidence": "unknown",
+      "threadId": "thr-81",
+      "turnId": "turn-81-1"
+    }
+  ]
+}
diff --git a/tests/fixtures/recorded-turns/resolved.json b/tests/fixtures/recorded-turns/resolved.json
new file mode 100644
index 0000000..120dad6
--- /dev/null
+++ b/tests/fixtures/recorded-turns/resolved.json
@@ -0,0 +1,37 @@
+{
+  "name": "needs_info → evidence → resolved",
+  "testRunId": 39,
+  "turns": [
+    {
+      "status": "NEEDS_INFO",
+      "confidence": "low",
+      "threadId": "thr-39",
+      "questions": ["Did the buildName validator change?"],
+      "asks": [
+        {
+          "what": "Did request-validation on POST /builds change since last green?",
+          "why": "the failing test posts an empty buildName",
+          "evidenceType": "product_code",
+          "priority": "high"
+        },
+        {
+          "what": "Full run logs for test 39",
+          "why": "to read the failure",
+          "evidenceType": "test_logs",
+          "priority": "high"
+        }
+      ]
+    },
+    {
+      "status": "RESOLVED",
+      "confidence": "high",
+      "threadId": "thr-39",
+      "rca": {
+        "root_cause": "PR #7421 tightened the buildName validator to reject empty strings",
+        "possible_fix": "send a non-empty buildName or relax the validator",
+        "failure_type": "product_regression",
+        "related_prs": ["#7421"]
+      }
+    }
+  ]
+}
diff --git a/tests/fixtures/recorded-turns/turn-cap.json b/tests/fixtures/recorded-turns/turn-cap.json
new file mode 100644
index 0000000..4638b61
--- /dev/null
+++ b/tests/fixtures/recorded-turns/turn-cap.json
@@ -0,0 +1,12 @@
+{
+  "name": "turn-cap — never resolves",
+  "testRunId": 99,
+  "turns": [
+    { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] },
+    { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] },
+    { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] },
+    { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] },
+    { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] },
+    { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] }
+  ]
+}

From e9331afce51c173725c1b129a4d3e664f630e20f Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:20:27 +0530
Subject: [PATCH 11/12] fix(rca): make pending-resume resumable, enforce flip
 terminal status, skip turn-cap gather
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Code-review fixes (suggested, non-blocking):
- pending-resume removed from TERMINAL_STATES → soft-PENDING rows are now
  re-claimable, listed by pendingRows, and skipped by the reaper (they cleared
  in_flight), so the retained threadId/turnId actually drive an in-session resume
  instead of being stranded in a state treated as terminal but never finished.
- flip() now rejects a missing/non-terminal rca_done without mutating, so a partial
  flip can't clear the claim yet leave the row pending (duplicate-RCA clobber).
- loop checks the turn-cap BEFORE gathering, so evidence on the never-submitted
  final turn isn't gathered for nothing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 lib/csv-state.mjs        | 26 ++++++++++++++++++--------
 lib/loop.mjs             |  8 ++++++--
 tests/csv-state.test.mjs | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/lib/csv-state.mjs b/lib/csv-state.mjs
index 499f997..ea830ff 100644
--- a/lib/csv-state.mjs
+++ b/lib/csv-state.mjs
@@ -40,12 +40,13 @@ export const COLUMNS = [
 ];
 
 export const PENDING = "pending";
-const TERMINAL_STATES = new Set([
-  "resolved",
-  "blocked",
-  "failed",
-  "pending-resume",
-]);
+export const RESUMABLE = "pending-resume";
+// Truly done — never re-claimed, listed, or reaped.
+const TERMINAL_STATES = new Set(["resolved", "blocked", "failed"]);
+// Valid outcomes flip() may write. `pending-resume` is a *soft* terminal: this
+// attempt ended (claim cleared) but the row stays resumable — it keeps its
+// threadId/turnId and is picked back up by the next fan-out / resume pass.
+const FLIP_STATES = new Set(["resolved", "blocked", "failed", RESUMABLE]);
 
 // ---- minimal RFC4180-ish CSV codec ----------------------------------------
 
@@ -199,6 +200,11 @@ export function heartbeat(csvPath, testRunId, worker, nowMs) {
 // possible_fix, related_prs, threadId, turnId, coverage, confidence,
 // last_evidence_digest, cluster_id.
 export function flip(csvPath, testRunId, fields, nowMs) {
+  // Enforce the contract: a flip must name an outcome in FLIP_STATES. A partial
+  // flip with a missing/unrecognized rca_done would otherwise clear the claim yet
+  // leave the row `pending` — re-exposing it to a duplicate RCA that clobbers this
+  // result. Reject without mutating so the worker keeps its claim and the bug surfaces.
+  if (!FLIP_STATES.has(fields?.rca_done)) return false;
   const rows = readRows(csvPath);
   const row = rows.find((r) => String(r.testRunId) === String(testRunId));
   if (!row) return false;
@@ -233,7 +239,11 @@ export function reaper(csvPath, ttlSec, nowMs) {
   return reclaimed;
 }
 
-// Rows still needing work (pending or reclaimed). The work-list for fan-out.
+// Rows still needing work: fresh/reclaimed `pending` AND `pending-resume` rows
+// (soft-PENDING attempts that retain a threadId/turnId to resume). The fan-out
+// work-list. Truly terminal rows (resolved/blocked/failed) are excluded.
 export function pendingRows(csvPath) {
-  return readRows(csvPath).filter((r) => r.rca_done === PENDING);
+  return readRows(csvPath).filter(
+    (r) => r.rca_done === PENDING || r.rca_done === RESUMABLE,
+  );
 }
diff --git a/lib/loop.mjs b/lib/loop.mjs
index 1d1728d..9e59eb2 100644
--- a/lib/loop.mjs
+++ b/lib/loop.mjs
@@ -90,7 +90,12 @@ export async function runRcaLoop({
       return out("PENDING", turn, "soft-pending");
     }
 
-    // NEEDS_INFO: route + fulfill.
+    // NEEDS_INFO. Check the turn-cap BEFORE gathering — evidence assembled on a
+    // turn we will never submit is wasted work (and a side-effecting gather()
+    // would run for nothing).
+    if (turns >= turnCap) return out("PENDING", turn, "turn-cap");
+
+    // Route + fulfill.
     const buckets = routeAsks(turn.asks ?? [], config, manifest);
     const blocks = [];
     for (const s of buckets.skip) skipped.add(s.evidenceType);
@@ -109,7 +114,6 @@ export async function runRcaLoop({
       }
     }
 
-    if (turns >= turnCap) return out("PENDING", turn, "turn-cap");
     message = blocks.join("\n\n");
   }
 }
diff --git a/tests/csv-state.test.mjs b/tests/csv-state.test.mjs
index 5a9a60f..ca55d84 100644
--- a/tests/csv-state.test.mjs
+++ b/tests/csv-state.test.mjs
@@ -120,6 +120,39 @@ test("pendingRows returns only pending work", () => {
   assert.equal(pend[0].testRunId, "102");
 });
 
+test("flip rejects a missing/non-terminal rca_done without mutating the row", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000);
+  // missing rca_done
+  assert.equal(flip(csv, 101, { root_cause: "x" }, 2000), false);
+  // invalid rca_done
+  assert.equal(flip(csv, 101, { rca_done: "weird" }, 2000), false);
+  const row = readRows(csv).find((r) => r.testRunId === "101");
+  assert.equal(row.rca_done, PENDING); // status untouched by the rejected flips
+  assert.equal(row.in_flight_worker, "w1"); // claim intact — bug surfaces, no clobber
+  assert.equal(row.root_cause, ""); // nothing written
+});
+
+test("pending-resume is resumable: not terminal, listed, and re-claimable", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000);
+  flip(csv, 101, { rca_done: "pending-resume", threadId: "thr-1", turnId: "t-1" }, 2000);
+  const row = readRows(csv).find((r) => r.testRunId === "101");
+  assert.equal(row.in_flight_worker, ""); // this attempt released the claim
+  assert.equal(row.threadId, "thr-1"); // resume handles retained
+  assert.equal(row.turnId, "t-1");
+  // appears in the fan-out work-list and can be claimed by the resume pass
+  assert.ok(pendingRows(csv).some((r) => r.testRunId === "101"));
+  assert.equal(claim(csv, 101, "w2", 3000), true);
+});
+
+test("reaper ignores pending-resume rows (not in flight)", () => {
+  seed(csv, "build-1", TESTS);
+  claim(csv, 101, "w1", 1000);
+  flip(csv, 101, { rca_done: "pending-resume" }, 2000);
+  assert.deepEqual(reaper(csv, 600, 10_000_000), []);
+});
+
 test("CSV codec round-trips fields with commas, quotes, newlines", () => {
   seed(csv, "build-1", [{ test_id: 200, test_name: "weird" }]);
   flip(

From 3d22dfe6d9e4cebbc00824749553a43b3c5e38b7 Mon Sep 17 00:00:00 2001
From: Ruturaj-Browserstack <ruturaj.s@browserstack.com>
Date: Tue, 23 Jun 2026 22:27:42 +0530
Subject: [PATCH 12/12] chore(rca): gitignore local planning docs

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 9045f9d..4c14fb3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@ node_modules/
 .env
 # Per-run RCA batch state (the CSV/WAL spine + report) is workspace-local.
 .rca/
+# Planning docs (brainstorm/ideation/plan) stay local — not pushed.
+docs/