From 6934cf9b8a53113b822596d465fce7d8a9e43c76 Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack Date: Tue, 23 Jun 2026 21:54:55 +0530 Subject: [PATCH 01/12] =?UTF-8?q?feat(rca):=20scaffold=20plugin=20?= =?UTF-8?q?=E2=80=94=20manifest,=20MCP=20wiring,=20config,=20command,=20RE?= =?UTF-8?q?ADME?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Identity-only .claude-plugin/plugin.json; root .mcp.json wires the bstack MCP server (stdio); config/rca.config.json centralizes all formerly-hardcoded product/infra values (no kubectl/chitragupta/bifrost literals); /rca-build command parses build id + mode and hands off to the skill. Co-Authored-By: Claude Opus 4.8 --- .claude-plugin/plugin.json | 11 +++++++ .env.example | 8 +++++ .gitignore | 4 +++ .mcp.json | 14 ++++++++ README.md | 67 ++++++++++++++++++++++++++++++++++++-- commands/rca-build.md | 34 +++++++++++++++++++ config/rca.config.json | 25 ++++++++++++++ package.json | 10 ++++++ 8 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 .claude-plugin/plugin.json create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 .mcp.json create mode 100644 commands/rca-build.md create mode 100644 config/rca.config.json create mode 100644 package.json diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..c8c4beb --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "tfa-rca", + "description": "Drive collaborative root-cause analysis over all failed tests of a build, generic across product and infra.", + "version": "0.1.0", + "author": { + "name": "BrowserStack", + "url": "https://www.browserstack.com" + }, + "homepage": "https://github.com/browserstack/browserstack-ai-tfa-demo", + "license": "MIT" +} diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d86819e --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +# BrowserStack credentials — used by the 
bundled bstack MCP server for +# listTestIds + tfaRcaTurn. Per-user; never commit real values. +BROWSERSTACK_USERNAME= +BROWSERSTACK_ACCESS_KEY= + +# Observability base URL the TFA RCA chat runs against. Optional — +# the bstack MCP server defaults to its rengg-tfa staging URL when unset. +# O11Y_TFA_RCA_BASE_URL=https://api-observability-rengg-tfa.bsstag.com diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9045f9d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +node_modules/ +.env +# Per-run RCA batch state (the CSV/WAL spine + report) is workspace-local. +.rca/ diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..0502929 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,14 @@ +{ + "mcpServers": { + "bstack": { + "type": "stdio", + "command": "npx", + "args": ["-y", "@browserstack/mcp-server"], + "env": { + "BROWSERSTACK_USERNAME": "${BROWSERSTACK_USERNAME}", + "BROWSERSTACK_ACCESS_KEY": "${BROWSERSTACK_ACCESS_KEY}", + "O11Y_TFA_RCA_BASE_URL": "${O11Y_TFA_RCA_BASE_URL}" + } + } + } +} diff --git a/README.md b/README.md index 423d780..ff148ab 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,65 @@ -# browserstack-ai-tfa-demo -AI TFA Demo +# tfa-rca — generic multi-client RCA agent plugin + +Drive BrowserStack's collaborative root-cause-analysis loop over **all failed +tests of a build**, generic across product and infra, from inside an agentic +MCP client (Claude Code / Cursor / Codex). + +The plugin wraps two stable MCP tools — `listTestIds` and `tfaRcaTurn` (from the +`bstack` MCP server) — and adds the harness that batches RCA over a whole build, +clusters failures by signature, routes evidence requests to whatever +skills/tools the client already has, and writes a per-test RCA into the TRA +dashboard. + +> It **discovers and delegates** to the infra skills/tools already in your +> client (GitHub, k8s/EKS, kibana/other logs, metrics). It does **not** install +> or own those connectors. 
+ +## Install + +```bash +git clone https://github.com/browserstack/browserstack-ai-tfa-demo.git +cd browserstack-ai-tfa-demo +cp .env.example .env # fill in BROWSERSTACK_USERNAME / BROWSERSTACK_ACCESS_KEY +claude --plugin-dir ./ +``` + +The plugin auto-configures on load: the `bstack` MCP server (from `.mcp.json`), +the `/rca-build` command, the `rca-build` skill, and the `ai-tfa-coordinator` +agent are all discovered by convention. + +## Usage + +``` +/rca-build <build_id> +/rca-build build_id=<id> mode=auto +``` + +On start the plugin runs a **mandatory pre-flight intake** asking for your +product + automation repos, working branch, default branch, and the PRs in +play, plus the build id. Every question is answerable with "I don't have one" → +the run proceeds RCA-only. + +## Modes + +- **auto** — a dynamic workflow drives the whole batch (5 tests concurrent), no + mid-run prompts. When evidence can't be gathered (no matching skill), it + reports "unavailable" back to the TFA agent, which finalizes best-effort. +- **interactive** — the main session spawns subagents (5 at a time); on an + evidence gap a subagent returns the gap to the main agent, which asks you, + then feeds the answer back. + +`auto` means autonomy *during* the batch from an interactive session — not +headless. Running `claude -p` with a required input missing ends immediately. + +## Requirements + +- The `bstack` MCP server (bundled via `.mcp.json`). +- Credentials in `.env` (or your client's MCP env). +- For full evidence coverage: whatever GitHub / infra / logging / metrics + skills your client already has. Missing ones degrade gracefully (the RCA's + confidence band reflects what evidence was actually available). + +## Layout + +See `docs/plans/2026-06-23-001-feat-generic-rca-agent-plugin-plan.md` for the +implementation plan and `docs/brainstorms/` for the requirements.
diff --git a/commands/rca-build.md b/commands/rca-build.md new file mode 100644 index 0000000..7a7a829 --- /dev/null +++ b/commands/rca-build.md @@ -0,0 +1,34 @@ +--- +description: Run collaborative RCA over all failed tests of a BrowserStack build +--- + +# /rca-build + +Entry point for the generic RCA harness. Drives a collaborative root-cause +analysis loop over **every failed test** of a build, generic across product and +infra. + +## Input + +`$ARGUMENTS` carries the build id (and optional flags). Accepted forms: + +- bare build id: `qzqhbfa5bkjakcbxtvy2siwtpcvsvgm9fxfyb03d5` +- `build_id=<id>` +- a build dashboard link (the id is extracted) +- optional `mode=auto` | `mode=interactive` (default: prompt the user) + +Parse the build id. If none is present, this is a required input: + +- in an interactive session → ask the user for it +- in headless (`claude -p`) → **end immediately** (fail fast), do not hang + +## Behavior + +Invoke the `rca-build` skill, passing the parsed build id and mode. The skill +owns the full flow: mandatory pre-flight GitHub intake → discovery via +`listTestIds` → CSV/WAL spine → failure-signature clustering → fan-out +(auto = dynamic workflow / interactive = subagents) → per-test RCA loop via +`tfaRcaTurn` → report. + +Do not re-implement the orchestration here — this command only parses input and +hands off to the skill. diff --git a/config/rca.config.json b/config/rca.config.json new file mode 100644 index 0000000..b7633bb --- /dev/null +++ b/config/rca.config.json @@ -0,0 +1,25 @@ +{ + "$comment": "Central config for the generic RCA harness. All formerly-hardcoded product/infra values live here.
No kubectl/chitragupta/bifrost literals — infra tools are discovered at runtime via the capability manifest (see skills/rca-build/references/evidence-routing.md).", + "mcpServerName": "bstack", + "concurrency": 5, + "turnCap": 6, + "turnMessageMaxChars": 5000, + "pollSoftPendingMs": 90000, + "reaperHeartbeatTtlSec": 600, + "errorSummaryMaxChars": 200, + "paths": { + "stateDir": ".rca", + "csvFile": ".rca/rca-state.csv", + "reportFile": ".rca/rca-report.md" + }, + "evidenceRouting": { + "test_logs": { "owner": "tfa", "skip": true }, + "product_code": { "capability": "github", "discoveryHints": ["github-mcp", "gh"] }, + "deploy": { "capability": "github", "discoveryHints": ["github-mcp", "gh"] }, + "ci": { "capability": "github", "discoveryHints": ["github-mcp", "gh"] }, + "k8s": { "capability": "k8s", "discoveryHints": [] }, + "kibana": { "capability": "logs", "discoveryHints": [] }, + "metrics": { "capability": "metrics", "discoveryHints": [] }, + "other": { "capability": "other", "discoveryHints": [] } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..c11e40f --- /dev/null +++ b/package.json @@ -0,0 +1,10 @@ +{ + "name": "tfa-rca-plugin", + "version": "0.1.0", + "private": true, + "type": "module", + "description": "Generic multi-client RCA agent plugin harness", + "scripts": { + "test": "node --test tests/" + } +} From f0d5cf63e017a5669764a7f59446d16a59550d0b Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 21:58:47 +0530 Subject: [PATCH 02/12] feat(rca): generic per-test RCA coordinator + evidence-routing registry Port the obs-tfa-rca loop decoupled: ai-tfa-coordinator drives tfaRcaTurn to a terminal RCA (turn-cap, one-thread, soft-PENDING, digest-not-dump) with the gather mechanism routed by capability (no kubectl/chitragupta/bifrost literals).
lib/routing.mjs classifies each ask skip/gather/gap against the config registry + capability manifest; the gap action is the only mode fork (auto=unavailable, interactive=ask-user). references/evidence-routing.md carries the digest format and size caps verbatim. Adds sibling pre-seed one-turn-confirm hook. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- agents/ai-tfa-coordinator.md | 185 ++++++++++++++++++ lib/routing.mjs | 75 +++++++ package.json | 2 +- .../rca-build/references/evidence-routing.md | 133 +++++++++++++ tests/routing.test.mjs | 80 ++++++++ 5 files changed, 474 insertions(+), 1 deletion(-) create mode 100644 agents/ai-tfa-coordinator.md create mode 100644 lib/routing.mjs create mode 100644 skills/rca-build/references/evidence-routing.md create mode 100644 tests/routing.test.mjs diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md new file mode 100644 index 0000000..e2045fb --- /dev/null +++ b/agents/ai-tfa-coordinator.md @@ -0,0 +1,185 @@ +--- +name: ai-tfa-coordinator +description: 'Per-test collaborative-RCA coordinator. Given ONE testRunId, drives the tfaRcaTurn MCP loop to a terminal root cause: TFA reads the run logs; this coordinator supplies every non-log evidence ask (product code, k8s, kibana, metrics, deploy, ci) using whatever skills/tools the client has, routed through the capability manifest. Skips every test_logs ask (TFA owns logs). Emits a structured RCA_OUTPUT block. Generic over product and infra — no hardcoded tools.
Examples: +- orchestrator: Agent(subagent_type="ai-tfa-coordinator", prompt="RCA testRunId=39 — error: empty buildName rejected on POST /builds") → drives the loop, returns RCA_OUTPUT +- sibling confirm: Agent(subagent_type="ai-tfa-coordinator", prompt="RCA testRunId=40 — pre-seed: cause=, suspect PR=#7421") → one-turn confirm against this test logs +- user: "run collaborative RCA on test run 39" → single-test loop to RESOLVED/BLOCKED/PENDING' +tools: [Bash, Read, Grep, Glob, Task, mcp__*__tfaRcaTurn, mcp__github__*] +model: sonnet +--- + +# Per-Test Collaborative RCA Coordinator (`ai-tfa-coordinator`) + +Drives the `tfaRcaTurn` MCP loop for a **single** failed test to a terminal RCA. +The collaboration contract is fixed: **TFA owns logs; this coordinator owns +everything else.** TFA (server-side, via the tool) reads the run's logs from its +own access and emits typed evidence asks; this coordinator fulfills every +**non-log** ask using whatever skills/tools the client has — routed through the +capability manifest — digests the findings, and feeds them back on the same +thread until TFA converges. TFA authors the RCA into the TRA dashboard. + +This coordinator is the **reusable unit**: it takes one `testRunId` and runs +standalone, driven by the auto workflow, an interactive subagent, or a thin +sequential harness. It is **generic over product and infra** — it names no +`kubectl` / `chitragupta` / `bifrost`; it routes by *capability*. + +## Inputs + +- `testRunId` — **required**, the integer test-run ID. Maps to the tool's `testRunId` arg. +- `error_digest` — optional short error title + endpoint (NOT logs) for the first-turn message. +- `pre_seed` — optional. For a **cluster sibling**: the representative's + `root_cause` + suspect `related_prs`. When present, the first-turn message + states the hypothesis and asks TFA to **confirm it against this test's own logs**. +- `resume` — optional `{ threadId, turnId }` from a prior PENDING run. 
+- `manifest` — the capability manifest `{ capability: { available, via } }` (from the orchestrator's pre-compute). +- `mode` — `auto` | `interactive`. Selects the **gap-resolver** (see below). + +If `testRunId` is missing or not parseable as an integer, emit a `failed` +`RCA_OUTPUT` block with `root_cause: "no testRunId provided"` and stop — do not +call the tool. + +## Operating principles + +1. **Logs by TFA — the core contract.** Never seed logs in the first turn; + **skip every ask with `evidenceType === "test_logs"`**. Never fetch, paste, or + digest log content. Logs are TFA's job. +2. **Read-only.** Every gather mechanism is read-only. Never write to a repo, + cluster, ticket, or the run. Produce a block and stop. +3. **Turn-cap** = `turnCap` from `config/rca.config.json` (default 6). If the cap + is hit while still `NEEDS_INFO`, end as `PENDING` (note `turn-cap`) — never an + extra turn, never a busy-wait. +4. **One thread per test.** First turn omits `threadId`; capture it from the + response and reuse it on every follow-up. Never start a second thread. +5. **Soft-PENDING ends the loop.** A tool result of `status: "PENDING"` (in-call + poll exceeded its wall-clock cap) ends the loop immediately as `PENDING`, + carrying `threadId` + `turnId` for a later resume. Do not re-poll or sleep. +6. **Digest, don't dump.** Every follow-up `message` carries digested findings + (`ask → found → snippet/link`), never raw log tails, full diffs, or full files. + Size caps + block shape live in `references/evidence-routing.md` — read it + before fulfilling any ask. The tool caps `message` at 5000 chars. +7. **Report gaps, don't drop them.** An ask the coordinator cannot fulfill becomes + a `not-found` / `unreachable` / `unavailable` block, never a silent omission. +8. **Never editorialize.** Report findings (suspect PR, server-side error line), + not verdicts. The root cause is TFA's to state on `RESOLVED`; pass its `rca` + through verbatim. 
+ +## The gap-resolver (mode fork) + +Routing an ask yields `skip` / `gather` / `gap` (see `references/evidence-routing.md`). +The only behavioral difference between modes is what happens on a **gap** (no +capability available for that `evidenceType`): + +- **auto** → emit an `unavailable` block back to TFA (no user prompt). TFA + finalizes best-effort with lower confidence. +- **interactive** → **return the gap to the caller** (the main agent), which asks + the user (A1) for that data, then feeds the answer back. A subagent cannot + prompt the user itself. + +Everything else — the loop, routing, digest, caps, output — is identical across +modes. Do not fork the loop; only the gap action differs. + +## The loop + +``` +0. Parse inputs → testRunId (int). Build the first-turn DIGEST: + - pre_seed present → "Hypothesis from cluster representative: <root_cause>. + Suspect PR(s): <related_prs>. Confirm against THIS test's logs." (NO logs) + - error_digest present → "Error: <error_digest>" (NO logs, NO threadId) + - neither → "Initiating collaborative RCA for test run <id>." +1. SUBMIT turn 1: tfaRcaTurn(testRunId=<id>, message=<digest>). Capture threadId. turns_used = 1. + (resume case: tfaRcaTurn(testRunId, threadId, turnId) instead, then continue at 2.) +2. CLASSIFY result.status: + RESOLVED → capture rca; END (RESOLVED). + BLOCKED → capture reason + unmetAsks; END (BLOCKED). + PENDING → capture threadId + turnId; END (PENDING, note "soft-pending"). + NEEDS_INFO → go to 3. +3. ROUTE the asks (read references/evidence-routing.md; route via lib/routing.mjs): + For each ask, high → medium → low: + skip → record in asks_skipped, emit nothing. + gather → run the discovered skill/tool for its capability, digest into one block. + Record evidenceType in asks_fulfilled (dedupe). + gap → run the mode's gap-resolver (auto: unavailable block; interactive: return to caller). + Concatenate per-ask blocks into the next-turn MESSAGE (respect size caps). +4.
SUBMIT follow-up on the SAME thread: tfaRcaTurn(testRunId, message, threadId). turns_used += 1. +5. TURN-CAP CHECK: if turns_used >= turnCap and still NEEDS_INFO → END (PENDING, "turn-cap"). + else → go to 2 with the new result. +6. EMIT the RCA_OUTPUT block from the captured terminal state. +``` + +**Sibling confirm (cluster member).** When `pre_seed` is present the first turn +states the representative's hypothesis and asks TFA to confirm against this +test's own logs. If TFA `RESOLVED`s in one turn → a logs-grounded per-test RCA at +minimal cost. If TFA instead returns `NEEDS_INFO` / `BLOCKED` (the hypothesis +does not hold for this test), **fall back to the normal loop** — never blindly +inherit the representative's cause. + +## Output contract — `RCA_OUTPUT` + +Emit **exactly one** block at the end of every run (including the `failed` +no-input case). The orchestrator parses it into one CSV row / report record. + +``` +RCA_OUTPUT_START + +## testRunId +<integer> + +## status +<RESOLVED | BLOCKED | PENDING | failed> + +## confidence +<high | medium | low | unknown> # from the terminal turn; unknown for PENDING/failed + +## root_cause +<RESOLVED → rca.root_cause verbatim · BLOCKED → TFA's reason · PENDING/failed → "not available" or the note> + +## possible_fix +<RESOLVED → rca.possible_fix verbatim · else "not available"> + +## related_prs +- <each PR TFA recorded in rca.related_prs; "none" if empty> + +## suspect_signals +- <each non-log signal surfaced: suspect PR / deploy / server-side error line; "none" if empty> + +## thread_id +<threadId from the first turn · "not available" if none> + +## turn_id +<turnId — present for PENDING (resume handle); else "not available"> + +## turns_used +<integer 1..turnCap> + +## asks_fulfilled +- <evidenceType> # every non-test_logs type fulfilled; "none" if empty + +## asks_skipped +- test_logs # present once a test_logs ask appeared + +## asks_unavailable +- <evidenceType> # gaps with no capability (drives the coverage 
stamp, U10); "none" if empty + +RCA_OUTPUT_END +``` + +Notes: +- `status` is one of exactly four values. `turn-cap` and `soft-pending` both + report as `PENDING`; note which in `root_cause`. +- `asks_skipped` always includes `test_logs` whenever TFA asked for logs. + `asks_fulfilled` **never** includes `test_logs`. +- `asks_unavailable` is the evidence-coverage signal U10 turns into a confidence band. +- `failed` is the no-parseable-result / no-input case; the orchestrator + synthesizes a `failed` row if this coordinator dies — keep the block valid. + +## Hard limits + +- **Never** fulfill or seed a `test_logs` ask — TFA owns logs. +- **Never** exceed `turnCap` `tfaRcaTurn` calls in one run. +- **Never** start a second thread for the same test — reuse the first turn's `threadId`. +- **Never** busy-wait / re-poll on a soft-`PENDING` — end and report it resumable. +- **Never** dump raw logs, full diffs, or full file contents into a turn message — digest only. +- **Never** write to any repo / cluster / ticket / the run — every action is read-only. +- **Never** editorialize a cause — pass TFA's `rca` through verbatim. +- **Never** blindly inherit a representative's cause for a sibling — confirm against its own logs. +- **Always** emit exactly one valid `RCA_OUTPUT` block, even on the `failed` path. diff --git a/lib/routing.mjs b/lib/routing.mjs new file mode 100644 index 0000000..291738e --- /dev/null +++ b/lib/routing.mjs @@ -0,0 +1,75 @@ +// Evidence-routing registry (D3). Maps a TFA `ask.evidenceType` onto an +// action, given the run's capability manifest. Pure + dependency-free so it is +// testable and reusable by both the auto workflow and interactive subagents. +// +// `test_logs` is the TFA agent's own evidence and is always skipped. Every +// other type routes to a capability; whether that capability is *available* is +// decided by the manifest (built once per run — see U6 / buildManifest). 
+ +import { readFileSync } from "node:fs"; + +export const TEST_LOGS = "test_logs"; + +const PRIORITY_RANK = { high: 0, medium: 1, low: 2 }; + +// Load and parse config/rca.config.json from an absolute or cwd-relative path. +export function loadConfig(configPath) { + return JSON.parse(readFileSync(configPath, "utf8")); +} + +// Order a turn's asks high → medium → low (unknown priority sorts last). +export function orderAsks(asks = []) { + return [...asks].sort( + (a, b) => + (PRIORITY_RANK[a?.priority] ?? 99) - (PRIORITY_RANK[b?.priority] ?? 99), + ); +} + +// Classify one ask. Returns one of: +// { action: "skip", ... } — test_logs / TFA-owned; the coordinator emits nothing +// { action: "gather", ... } — a capability is available; gather + digest +// { action: "gap", ... } — no capability; the caller's resolveGap() decides +// (auto → "unavailable" block; interactive → ask the user) +// +// `manifest` shape: { [capability]: { available: boolean, via?: string } }. +export function routeAsk(ask, config, manifest = {}) { + const evidenceType = ask?.evidenceType ?? "other"; + const routing = config?.evidenceRouting ?? {}; + const entry = routing[evidenceType] ?? routing.other ?? { capability: "other" }; + + if (entry.skip || entry.owner === "tfa") { + return { evidenceType, action: "skip", reason: "tfa-owned" }; + } + + const capability = entry.capability ?? "other"; + const cap = manifest[capability]; + if (cap && cap.available) { + return { + evidenceType, + action: "gather", + capability, + via: cap.via ?? null, + }; + } + + return { + evidenceType, + action: "gap", + capability, + discoveryHints: entry.discoveryHints ?? [], + reason: "no-capability", + }; +} + +// Split a turn's asks into the three buckets, in priority order. The +// coordinator gathers `gather`, runs resolveGap() on each `gap`, and records +// `skip` (test_logs) without emitting anything. 
+export function routeAsks(asks, config, manifest = {}) { + const ordered = orderAsks(asks); + const buckets = { skip: [], gather: [], gap: [] }; + for (const ask of ordered) { + const routed = routeAsk(ask, config, manifest); + buckets[routed.action].push({ ask, ...routed }); + } + return buckets; +} diff --git a/package.json b/package.json index c11e40f..27344a7 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,6 @@ "type": "module", "description": "Generic multi-client RCA agent plugin harness", "scripts": { - "test": "node --test tests/" + "test": "node --test" } } diff --git a/skills/rca-build/references/evidence-routing.md b/skills/rca-build/references/evidence-routing.md new file mode 100644 index 0000000..e6cc4d0 --- /dev/null +++ b/skills/rca-build/references/evidence-routing.md @@ -0,0 +1,133 @@ +# Evidence Routing + +Load this file **before fulfilling any `NEEDS_INFO` ask** in the per-test RCA +loop (`agents/ai-tfa-coordinator`). It maps each TFA `evidenceType` to a +**capability** (not a hardcoded tool), and defines the **digest** the coordinator +submits on the next turn. + +The core contract: **TFA owns logs; the client agent owns everything else.** The +coordinator never seeds logs and never fulfills a `test_logs` ask. Every other +`evidenceType` routes to a capability that is gathered via **whatever skill/tool +the client actually has** for it (discovered once into the capability manifest — +see `SKILL.md` § Pre-compute). There are **no `kubectl` / `chitragupta` / +`bifrost` literals here** — that is the whole point of going generic. + +The registry logic lives in `lib/routing.mjs` (`routeAsk` / `routeAsks`); this +file is the human/agent-facing contract for the digest and the size caps. + +--- + +## How a turn's asks are processed + +A `NEEDS_INFO` turn returns `asks: TfaAsk[]`, each `{ what, why, evidenceType, +priority }`. For each ask, in descending `priority` (`high` → `medium` → `low`): + +1. 
Route the `evidenceType` (via `lib/routing.mjs` → the config registry + + capability manifest). The result is one of three actions: + - **skip** — `test_logs` (TFA-owned). Gather nothing; record in `asks_skipped`. + - **gather** — a capability is available. Run its discovered skill/tool scoped + by `what` / `why`, then digest the result into one ask block. + - **gap** — no capability is available. Hand the ask to the injected + **`resolveGap()`** policy: + - **auto mode** → emit an `unavailable` block back to TFA (no user prompt). + - **interactive mode** → return the gap to the main agent, which asks the + user, then feeds the answer back. +2. Concatenate the per-ask blocks into the next-turn `message` and resubmit on + the same `threadId`. + +An ask that cannot be fulfilled is **never silently dropped** — it becomes a +`not-found` / `unreachable` / `unavailable` block so TFA can reason about the gap. + +--- + +## Routing table (capability, not tool) + +`evidenceType` literals are exactly those `tfaRcaTurn` emits: `test_logs`, +`product_code`, `k8s`, `kibana`, `metrics`, `deploy`, `ci`, `other`. 
+ +| `evidenceType` | Capability | Gathered via (discovered at runtime) | +|---|---|---| +| `test_logs` | — (TFA, skip) | never gathered; TFA self-serves from its own log access | +| `product_code` | `github` | the client's GitHub capability — **GitHub MCP if present, else `gh`** (see `references/github-evidence.md`) | +| `deploy` | `github` | deploy timeline via the GitHub capability (releases/tags + deploy record) | +| `ci` | `github` | CI config + run history via the GitHub capability | +| `k8s` | `k8s` | whatever k8s/EKS skill the client has — discovered, not assumed | +| `kibana` | `logs` | whatever log-search skill the client has (kibana or other) | +| `metrics` | `metrics` | whatever metrics skill the client has | +| `other` | `other` | best-effort by ask text; else a `not-found` block | + +The mapping is data in `config/rca.config.json` (`evidenceRouting`), so a +different deployment can remap `evidenceType → capability` without code changes. + +**Deployment-state guard:** a suspect PR only matters if its code was actually +live in the run's env at the failure window. If you can cheaply confirm it was +not deployed / behind an OFF flag, say so in the digest rather than feeding TFA a +suspect that could not have caused the failure. (Full protocol: U9 / +`references/github-evidence.md`.) + +--- + +## Digest format + +The single most important discipline: **digested input, not raw dumps.** Every +turn's `message` loads into the agent's context *and* is sent to TFA; a raw log +tail or full PR diff blows both budgets and degrades TFA's reasoning. Supply the +*findings*, not the *haystack*. + +### Per-ask block shape — `ask → found → snippet/link` + +``` +ASK: <verbatim `what` from the TfaAsk, ≤ 120 chars> +TYPE: <evidenceType> +FOUND: <yes | no | partial> +SUMMARY: <1–3 sentences — the finding, in the agent's words. ≤ 400 chars> +SNIPPET: + <the load-bearing excerpt only — see size caps. 
Omit if a LINK fully carries it.> +LINK: <permalink to the source — PR/commit/log-search/metrics panel/deploy record. Omit if N/A.> +``` + +- `SUMMARY` is the answer. `SNIPPET` is the *minimum* evidence backing it. `LINK` + lets TFA (or a human) verify without the bytes living in the message. +- Prefer **LINK over SNIPPET** whenever a permalink fully carries the evidence. + +### Size caps (hard ceilings — truncate, never exceed) + +| Field / scope | Soft target | Hard ceiling | On exceed | +|---|---|---|---| +| `SUMMARY` | ≤ 300 chars | 400 chars | Tighten to the finding; drop restatement of the ask | +| `SNIPPET` per ask | ≤ 20 lines | 40 lines | Keep the load-bearing lines; replace the rest with `… (N lines elided — see LINK)` | +| Code diff in a `product_code` snippet | ≤ 1 hunk | 3 hunks | Show changed lines + 3 lines context; link the full PR | +| Whole next-turn `message` | ≤ 200 lines | 400 lines (and ≤ `turnMessageMaxChars`) | Drop `low`-priority asks first; keep every `high` ask's block | +| Asks fulfilled per turn | all `high` + `medium` | — | Defer `low` asks to a later turn rather than truncating a `high` ask | + +Truncation rule of thumb: **never truncate a `high`-priority ask's block to fit a +`low`-priority one.** Drop the low block whole; keep the high block intact. The +whole-message ceiling also honors `turnMessageMaxChars` from +`config/rca.config.json` (the tool caps `message` at 5000 chars). + +### What never goes in a digest + +- Raw log tails, full log output, full file contents, full PR diffs — link or excerpt. +- `test_logs` content of any kind (TFA owns it). +- Credentials, tokens, internal hostnames, or any secret surfaced by an env/secret dump. +- Speculation dressed as a finding. If `FOUND: no`, say what was checked; do not invent a cause. 
+ +--- + +## Unfulfillable asks — report, don't drop + +``` +ASK: <verbatim what> +TYPE: <evidenceType> +FOUND: no +SUMMARY: not-found | unreachable | unavailable | out-of-scope — <one line: what was checked or why blocked> +``` + +- `not-found` — the skill/tool ran but the signal isn't there. State the search performed. +- `unreachable` — the surface was not reachable from this agent context. State which. +- `unavailable` — no capability/skill exists for this `evidenceType` (auto-mode gap result). +- `out-of-scope` — the ask is `test_logs` or otherwise not the agent's to fulfill. + +An all-`unavailable` / all-`not-found` turn still resubmits — TFA decides whether +the gap is fatal (→ BLOCKED) or it can converge anyway (best-effort, lower +confidence). The coordinator does not pre-empt that decision. diff --git a/tests/routing.test.mjs b/tests/routing.test.mjs new file mode 100644 index 0000000..c63b595 --- /dev/null +++ b/tests/routing.test.mjs @@ -0,0 +1,80 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { routeAsk, routeAsks, orderAsks, TEST_LOGS } from "../lib/routing.mjs"; + +const CONFIG = { + evidenceRouting: { + test_logs: { owner: "tfa", skip: true }, + product_code: { capability: "github", discoveryHints: ["github-mcp", "gh"] }, + k8s: { capability: "k8s", discoveryHints: [] }, + other: { capability: "other", discoveryHints: [] }, + }, +}; + +test("test_logs is always skipped (TFA-owned)", () => { + const r = routeAsk({ evidenceType: TEST_LOGS, priority: "high" }, CONFIG, { + github: { available: true }, + }); + assert.equal(r.action, "skip"); + assert.equal(r.reason, "tfa-owned"); +}); + +test("available capability → gather, carrying via", () => { + const r = routeAsk({ evidenceType: "product_code", priority: "high" }, CONFIG, { + github: { available: true, via: "github-mcp" }, + }); + assert.equal(r.action, "gather"); + assert.equal(r.capability, "github"); + assert.equal(r.via, "github-mcp"); +}); + 
+test("unavailable capability → gap, carrying discovery hints", () => { + const r = routeAsk({ evidenceType: "k8s", priority: "medium" }, CONFIG, { + k8s: { available: false }, + }); + assert.equal(r.action, "gap"); + assert.equal(r.capability, "k8s"); + assert.equal(r.reason, "no-capability"); +}); + +test("capability absent from manifest entirely → gap", () => { + const r = routeAsk({ evidenceType: "k8s", priority: "low" }, CONFIG, {}); + assert.equal(r.action, "gap"); +}); + +test("unknown evidenceType falls back to the 'other' entry", () => { + const r = routeAsk({ evidenceType: "weird", priority: "low" }, CONFIG, { + other: { available: true, via: "best-effort" }, + }); + assert.equal(r.action, "gather"); + assert.equal(r.capability, "other"); +}); + +test("orderAsks sorts high → medium → low, unknown last", () => { + const ordered = orderAsks([ + { what: "c", priority: "low" }, + { what: "a", priority: "high" }, + { what: "d", priority: undefined }, + { what: "b", priority: "medium" }, + ]); + assert.deepEqual( + ordered.map((a) => a.what), + ["a", "b", "c", "d"], + ); +}); + +test("routeAsks buckets a mixed turn in priority order", () => { + const buckets = routeAsks( + [ + { evidenceType: "k8s", priority: "low" }, + { evidenceType: "test_logs", priority: "high" }, + { evidenceType: "product_code", priority: "high" }, + ], + CONFIG, + { github: { available: true, via: "gh" } }, + ); + assert.equal(buckets.skip.length, 1); + assert.equal(buckets.gather.length, 1); + assert.equal(buckets.gap.length, 1); + assert.equal(buckets.gather[0].evidenceType, "product_code"); +}); From cb0d8f6d1408267f06619893c6202891c362b648 Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:01:27 +0530 Subject: [PATCH 03/12] feat(rca): pre-flight intake, discovery, CSV/WAL spine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKILL.md orchestrator spec: mandatory GitHub intake ('I don't 
have one' → RCA-only; headless missing-input fail-fast), discovery via listTestIds(failed, includeFailureDetail), then cluster/pre-compute/fan-out/report steps. lib/csv-state.mjs is the resumable WAL spine — seed (idempotent, terminal- preserving), claim/heartbeat/flip, reaper, pendingRows — with timestamps injected (workflow-sandbox-safe) and an RFC4180 codec for multiline RCA fields. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- lib/csv-state.mjs | 239 ++++++++++++++++++++++++++++++++++++++ skills/rca-build/SKILL.md | 124 ++++++++++++++++++++ tests/csv-state.test.mjs | 133 +++++++++++++++++++++ 3 files changed, 496 insertions(+) create mode 100644 lib/csv-state.mjs create mode 100644 skills/rca-build/SKILL.md create mode 100644 tests/csv-state.test.mjs diff --git a/lib/csv-state.mjs b/lib/csv-state.mjs new file mode 100644 index 0000000..499f997 --- /dev/null +++ b/lib/csv-state.mjs @@ -0,0 +1,239 @@ +// CSV write-ahead-log spine for the batch (D4 + ideation #7). The CSV is the +// single durable, resumable source of truth for "RCA over ALL failed tests": +// every test is a row, seeded `pending`, claimed by a worker, heartbeated while +// in flight, and flipped to a terminal state with its RCA. A reaper reclaims +// rows stranded by a crashed worker. +// +// Timestamps are passed in as `nowMs` (never read from the clock here) so this +// module is deterministic in tests AND usable from the auto-mode dynamic +// workflow, whose sandbox forbids Date.now(). +// +// In-session / in-workspace only — cross-session durability is deferred. Writes +// are synchronous read-modify-write; Node's single thread serializes them, which +// is sufficient for the in-process 5-concurrent workflow (true multi-process +// locking is out of scope). 
import {
  existsSync,
  mkdirSync,
  readFileSync,
  renameSync,
  writeFileSync,
} from "node:fs";
import { dirname } from "node:path";

// Ordered column set of the state CSV. The order here is the on-disk column
// order written by encodeRows.
export const COLUMNS = [
  "buildId",
  "testRunId",
  "testName",
  "failure_category",
  "error_summary",
  "file_path",
  "cluster_id",
  "rca_done",
  "in_flight_worker",
  "heartbeat_ts",
  "threadId",
  "turnId",
  "last_evidence_digest",
  "root_cause",
  "failure_type",
  "possible_fix",
  "related_prs",
  "coverage",
  "confidence",
  "timestamp",
];

// `rca_done` value of a row that still needs work.
export const PENDING = "pending";

// `rca_done` values that claim() refuses and seed() never refreshes.
const TERMINAL_STATES = new Set([
  "resolved",
  "blocked",
  "failed",
  "pending-resume",
]);

// O(1) column membership for flip(); COLUMNS remains the ordered source of truth.
const COLUMN_SET = new Set(COLUMNS);

// ---- minimal RFC4180-ish CSV codec ----------------------------------------

// Encode one cell. null/undefined become the empty string; a value containing
// a quote, comma, CR or LF is wrapped in quotes with inner quotes doubled.
function encodeField(value) {
  const s = value == null ? "" : String(value);
  if (/[",\r\n]/.test(s)) {
    return `"${s.replace(/"/g, '""')}"`;
  }
  return s;
}

// Serialize rows (objects keyed by column name) to CSV text: a header line
// followed by one line per row, "\n"-separated with a trailing newline.
function encodeRows(rows) {
  const lines = [COLUMNS.join(",")];
  for (const row of rows) {
    lines.push(COLUMNS.map((c) => encodeField(row[c])).join(","));
  }
  return lines.join("\n") + "\n";
}

// Parse CSV text into an array of records (arrays of strings). Handles quoted
// fields with embedded commas/newlines and doubled quotes; accepts "\n", "\r"
// and "\r\n" as record separators.
function parseCsv(text) {
  const rows = [];
  let field = "";
  let record = [];
  let inQuotes = false;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    if (inQuotes) {
      if (ch === '"') {
        if (text[i + 1] === '"') {
          // Doubled quote inside a quoted field is a literal quote.
          field += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else {
        field += ch;
      }
    } else if (ch === '"') {
      inQuotes = true;
    } else if (ch === ",") {
      record.push(field);
      field = "";
    } else if (ch === "\n" || ch === "\r") {
      // Treat CRLF as a single separator.
      if (ch === "\r" && text[i + 1] === "\n") i++;
      record.push(field);
      rows.push(record);
      field = "";
      record = [];
    } else {
      field += ch;
    }
  }
  // Flush a final record that was not newline-terminated.
  if (field.length > 0 || record.length > 0) {
    record.push(field);
    rows.push(record);
  }
  return rows;
}

// ---- read / write ----------------------------------------------------------

// Read the CSV into row objects keyed by the file's own header. Returns [] when
// the file does not exist or holds no non-blank records. Cells missing from a
// short record read as "".
export function readRows(csvPath) {
  if (!existsSync(csvPath)) return [];
  const text = readFileSync(csvPath, "utf8");
  const raw = parseCsv(text).filter((r) => r.some((c) => c.length > 0));
  if (raw.length === 0) return [];
  const [header, ...records] = raw;
  return records.map((cells) => {
    const row = {};
    header.forEach((col, idx) => {
      row[col] = cells[idx] ?? "";
    });
    return row;
  });
}

// Persist the full row set, creating the parent directory if needed. The data
// is written to a sibling temp file and renamed over the target so a crash
// mid-write cannot leave a truncated state file behind.
export function writeRows(csvPath, rows) {
  mkdirSync(dirname(csvPath), { recursive: true });
  const tmpPath = `${csvPath}.tmp`;
  writeFileSync(tmpPath, encodeRows(rows), "utf8");
  renameSync(tmpPath, csvPath);
}

// A row with every column present and empty.
function emptyRow() {
  return Object.fromEntries(COLUMNS.map((c) => [c, ""]));
}

// Locate a row by testRunId, comparing as strings so numeric and string ids
// match. Returns undefined when absent.
function findRow(rows, testRunId) {
  const id = String(testRunId);
  return rows.find((r) => String(r.testRunId) === id);
}

// ---- operations -------------------------------------------------------------

// Seed the CSV from a listTestIds(failed, includeFailureDetail) payload. Every
// row starts `pending`. Idempotent: existing rows are preserved (terminal rows
// are never reset; signature columns are refreshed on still-pending rows). New
// tests are appended. A null/undefined payload is treated as "no tests", so the
// file is still (re)written. Returns the full row set.
export function seed(csvPath, buildId, tests) {
  const existing = readRows(csvPath);
  const byId = new Map(existing.map((r) => [String(r.testRunId), r]));

  for (const t of tests ?? []) {
    const id = String(t.test_id ?? t.testRunId);
    const sig = t.failure ?? {};
    const prior = byId.get(id);
    if (prior) {
      // Keep terminal results; only refresh signature on still-pending rows.
      if (prior.rca_done === PENDING) {
        prior.failure_category = sig.category ?? prior.failure_category;
        prior.error_summary = sig.error_summary ?? prior.error_summary;
        prior.file_path = sig.file_path ?? prior.file_path;
      }
      continue;
    }
    const row = emptyRow();
    row.buildId = buildId;
    row.testRunId = id;
    row.testName = t.test_name ?? t.testName ?? `Test ${id}`;
    row.failure_category = sig.category ?? "";
    row.error_summary = sig.error_summary ?? "";
    row.file_path = sig.file_path ?? "";
    row.rca_done = PENDING;
    byId.set(id, row);
    existing.push(row);
  }

  writeRows(csvPath, existing);
  return existing;
}

// Claim a pending row for `worker`. Refuses (returns false) if the row is
// missing, already terminal, or owned by another worker. Re-claiming by the
// same worker succeeds and refreshes the heartbeat. Returns true on success.
export function claim(csvPath, testRunId, worker, nowMs) {
  const rows = readRows(csvPath);
  const row = findRow(rows, testRunId);
  if (!row) return false;
  if (row.in_flight_worker && row.in_flight_worker !== worker) return false;
  if (TERMINAL_STATES.has(row.rca_done)) return false;
  row.in_flight_worker = worker;
  row.heartbeat_ts = String(nowMs);
  writeRows(csvPath, rows);
  return true;
}

// Refresh the heartbeat of a row owned by `worker`. Returns false (and writes
// nothing) when the row is missing or owned by someone else.
export function heartbeat(csvPath, testRunId, worker, nowMs) {
  const rows = readRows(csvPath);
  const row = findRow(rows, testRunId);
  if (!row || row.in_flight_worker !== worker) return false;
  row.heartbeat_ts = String(nowMs);
  writeRows(csvPath, rows);
  return true;
}

// Flip a row to a terminal state, recording the RCA fields and clearing the
// in-flight claim. `fields` carries any of: rca_done, root_cause, failure_type,
// possible_fix, related_prs, threadId, turnId, coverage, confidence,
// last_evidence_digest, cluster_id. Unknown keys are ignored; arrays are joined
// with "; "; null clears a column. An `undefined` value means "not supplied"
// and leaves the column untouched, so a missing rca_done/threadId can never
// blank out existing state. Returns false when the row does not exist.
export function flip(csvPath, testRunId, fields, nowMs) {
  const rows = readRows(csvPath);
  const row = findRow(rows, testRunId);
  if (!row) return false;
  for (const [k, v] of Object.entries(fields ?? {})) {
    if (!COLUMN_SET.has(k) || v === undefined) continue;
    row[k] = Array.isArray(v) ? v.join("; ") : (v ?? "");
  }
  row.in_flight_worker = "";
  row.timestamp = String(nowMs);
  writeRows(csvPath, rows);
  return true;
}

// Reclaim rows stranded in flight (heartbeat older than ttlSec) back to pending.
// Returns the testRunIds reclaimed. Run on startup before resuming a batch.
+export function reaper(csvPath, ttlSec, nowMs) { + const rows = readRows(csvPath); + const reclaimed = []; + for (const row of rows) { + if (!row.in_flight_worker) continue; + if (TERMINAL_STATES.has(row.rca_done)) continue; + const hb = Number(row.heartbeat_ts); + const stale = !row.heartbeat_ts || nowMs - hb > ttlSec * 1000; + if (stale) { + row.in_flight_worker = ""; + row.rca_done = PENDING; + reclaimed.push(String(row.testRunId)); + } + } + if (reclaimed.length > 0) writeRows(csvPath, rows); + return reclaimed; +} + +// Rows still needing work (pending or reclaimed). The work-list for fan-out. +export function pendingRows(csvPath) { + return readRows(csvPath).filter((r) => r.rca_done === PENDING); +} diff --git a/skills/rca-build/SKILL.md b/skills/rca-build/SKILL.md new file mode 100644 index 0000000..4bf6b34 --- /dev/null +++ b/skills/rca-build/SKILL.md @@ -0,0 +1,124 @@ +--- +name: rca-build +description: Run collaborative root-cause analysis over ALL failed tests of a BrowserStack build. Generic across product and infra. Mandatory pre-flight GitHub intake, then discovery via listTestIds, failure-signature clustering, and per-test RCA via tfaRcaTurn (auto = dynamic workflow / interactive = subagents). Use when a build is red and you want a per-test RCA for every failure in the TRA dashboard. +--- + +# rca-build — batch collaborative RCA over a build + +Drives the `tfaRcaTurn` collaborative loop over **every failed test** of a build +and records a per-test RCA. **TFA owns logs; the client agent owns everything +else** (product code, k8s, kibana, metrics, deploy, ci) — routed by capability, +generic over product and infra. + +This skill is the **build-level orchestrator** (`ai-tfa-orchestrator` role). It +never calls `tfaRcaTurn` itself — it dispatches the `ai-tfa-coordinator` +(test-level) per test/cluster member, which drives the loop and lets TFA author +the dashboard RCA. 
+ +Config (concurrency, turn-cap, paths, evidence registry) lives in +`config/rca.config.json`. State lives in the CSV/WAL spine (`lib/csv-state.mjs`). + +## Step 0 — mode + input + +Parse from `/rca-build` args: the build id and optional `mode=auto|interactive`. + +- No build id present → it is required: + - interactive session → ask the user. + - **headless (`claude -p`) with build id missing → end immediately (fail fast).** +- No mode given → ask the user once (auto vs interactive). In headless, default `auto`. + +## Step 1 — pre-flight intake (F1, mandatory, both modes) + +Ask the user (A1) for, in one pass: + +- product repo name, automation (test) repo name +- working branch, default branch +- the PRs in play (product + automation) +- the build id (if not already supplied) + +Every question is **mandatory to ask** but answerable with **"I don't have one"** +→ record the gap and proceed **RCA-only** (BrowserStack-side evidence + whatever +infra skills exist). Do not block the run on missing GitHub context. + +**Headless rule:** in `claude -p`, any *required* input still missing after +parsing (build id) ends the run immediately. Optional intake answers default to +"none" without prompting. + +## Step 2 — discovery (F2) + +Call the bundled MCP tool: + +``` +listTestIds(buildId=<id>, status="failed", includeFailureDetail=true) +``` + +`includeFailureDetail=true` returns each row's trimmed failure signature +(`failure.{category, error_summary, file_path, …}`) — the seed for clustering, so +no per-test probe turns are needed. + +Seed the CSV/WAL spine from the payload (`lib/csv-state.mjs` → `seed`): one row +per failed test, every row `rca_done=pending`, signature columns populated. +Re-running `seed` on an existing CSV is idempotent and preserves terminal rows +(resume-safe). If `listTestIds` returns empty → write an empty CSV, report "no +failed tests", stop. 
+ +## Step 3 — failure-signature clustering (see references/clustering.md) + +Compute a failure signature per row and assign `cluster_id` (`lib/signature.mjs`). +Each cluster gets one **representative** (full multi-turn loop) and `N−1` +**siblings** (pre-seeded one-turn confirm against their own logs). This collapses +the expensive evidence hunt to O(distinct causes) while every test still lands a +per-test RCA. Singleton clusters are just plain per-test loops. + +## Step 4 — build-evidence pre-compute + capability manifest (see references/evidence-routing.md) + +Once, before fan-out: + +- **Capability manifest** — enumerate the skills/tools the client actually has + into `capability → {available, via}` (GitHub, k8s, logs, metrics, …). Declare + to the user up front what will be **unavailable** ("k8s + metrics not + available"). Every coordinator routes asks against this manifest. +- **Build-level evidence** — compute the last-green→this-build delta (diff, + deploy timeline, suspect-PR window) **once** and pre-seed every coordinator + with the same grounded window. Cache by `(repo, commit-range)`. No "last green" + baseline (never-green suite) → fall back to a configured baseline ref and log it. + +## Step 5 — fan-out (the mode fork) + +Drive the cluster work-list, **`concurrency` (default 5) at a time**: +representatives deep, siblings one-turn-confirm. Eagerly persist to the CSV/WAL +(claim → heartbeat → flip) so the run is resumable. + +- **auto** → run the dynamic workflow `workflows/rca-batch.mjs` (script-orchestrated, + no user input; gap → "unavailable" back to TFA → best-effort finalize). +- **interactive** → spawn `ai-tfa-coordinator` subagents 5 at a time; on an + evidence gap a subagent returns the gap to this orchestrator, which asks the + user (A1), then feeds the answer back. Subagents return compact `RCA_OUTPUT` + blocks, not transcripts (keeps the main context lean for large batches). 
+ +Both modes use the **same** `ai-tfa-coordinator`; only the injected gap-resolver +differs. A coordinator that dies becomes a recorded `failed` row — one stuck test +never sinks the batch (partial-first). + +## Step 6 — report (see references/report-format.md) + +When every row is terminal, render the report (`paths.reportFile`): per-test rows +with status + the **evidence-coverage band** (a RESOLVED built with evidence +unavailable reads as lower confidence than a fully-evidenced one). Degrade, +don't crash — missing fields render as "not available". + +## Resume + +On startup, run the reaper (`lib/csv-state.mjs` → `reaper`) to reclaim rows +stranded `in_flight` by a crashed worker (heartbeat older than +`reaperHeartbeatTtlSec`) back to `pending`, then re-point fan-out at the CSV. +Live `threadId`/`turnId` resume the prior thread; dead threads re-run from +pending. (In-session only — cross-session durability is deferred.) + +## Hard rules + +- Always run the pre-flight intake; never silently skip it (but never block on "I don't have one"). +- Headless + missing required input → end immediately. +- Never call `tfaRcaTurn` from this skill — always via the `ai-tfa-coordinator`. +- Every failed test must end terminal in the CSV — partial-first, no abort-on-one-failure. +- Never gather `test_logs` — TFA owns logs. 
diff --git a/tests/csv-state.test.mjs b/tests/csv-state.test.mjs new file mode 100644 index 0000000..5a9a60f --- /dev/null +++ b/tests/csv-state.test.mjs @@ -0,0 +1,133 @@ +import { test, beforeEach, afterEach } from "node:test"; +import assert from "node:assert/strict"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + seed, + readRows, + claim, + heartbeat, + flip, + reaper, + pendingRows, + PENDING, +} from "../lib/csv-state.mjs"; + +let dir; +let csv; + +beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), "rca-csv-")); + csv = join(dir, "state.csv"); +}); +afterEach(() => rmSync(dir, { recursive: true, force: true })); + +const TESTS = [ + { + test_id: 101, + test_name: "login", + failure: { category: "Assertion", error_summary: "expected 200", file_path: "a.rb" }, + }, + { test_id: 102, test_name: "checkout", failure: { category: "Timeout" } }, +]; + +test("seed writes one pending row per test with signature columns", () => { + const rows = seed(csv, "build-1", TESTS); + assert.equal(rows.length, 2); + assert.ok(rows.every((r) => r.rca_done === PENDING)); + const login = rows.find((r) => r.testRunId === "101"); + assert.equal(login.failure_category, "Assertion"); + assert.equal(login.error_summary, "expected 200"); + assert.equal(login.buildId, "build-1"); +}); + +test("seed is idempotent — no duplicate rows on re-seed", () => { + seed(csv, "build-1", TESTS); + const rows = seed(csv, "build-1", TESTS); + assert.equal(rows.length, 2); +}); + +test("seed preserves a terminal row on re-seed", () => { + seed(csv, "build-1", TESTS); + flip(csv, 101, { rca_done: "resolved", root_cause: "bad PR" }, 1000); + seed(csv, "build-1", TESTS); + const login = readRows(csv).find((r) => r.testRunId === "101"); + assert.equal(login.rca_done, "resolved"); + assert.equal(login.root_cause, "bad PR"); +}); + +test("claim sets the worker; a second worker is refused", () => { + seed(csv, "build-1", 
TESTS); + assert.equal(claim(csv, 101, "w1", 1000), true); + assert.equal(claim(csv, 101, "w2", 1000), false); + const row = readRows(csv).find((r) => r.testRunId === "101"); + assert.equal(row.in_flight_worker, "w1"); +}); + +test("heartbeat updates ts only for the owning worker", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); + assert.equal(heartbeat(csv, 101, "w1", 2000), true); + assert.equal(heartbeat(csv, 101, "w2", 3000), false); + assert.equal(readRows(csv).find((r) => r.testRunId === "101").heartbeat_ts, "2000"); +}); + +test("flip records terminal fields, joins related_prs, clears the claim", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); + flip( + csv, + 101, + { rca_done: "resolved", root_cause: "PR #7421", related_prs: ["#7421", "#7430"], confidence: "high" }, + 5000, + ); + const row = readRows(csv).find((r) => r.testRunId === "101"); + assert.equal(row.rca_done, "resolved"); + assert.equal(row.related_prs, "#7421; #7430"); + assert.equal(row.confidence, "high"); + assert.equal(row.in_flight_worker, ""); + assert.equal(row.timestamp, "5000"); +}); + +test("reaper reclaims only stale in-flight rows", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); // stale + claim(csv, 102, "w2", 9000); // fresh + const ttl = 600; // seconds + const now = 1000 + ttl * 1000 + 1; // just past TTL for w1, fresh for w2 + const reclaimed = reaper(csv, ttl, now); + assert.deepEqual(reclaimed, ["101"]); + const rows = readRows(csv); + assert.equal(rows.find((r) => r.testRunId === "101").in_flight_worker, ""); + assert.equal(rows.find((r) => r.testRunId === "101").rca_done, PENDING); + assert.equal(rows.find((r) => r.testRunId === "102").in_flight_worker, "w2"); +}); + +test("reaper leaves terminal rows alone even if in_flight lingered", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); + flip(csv, 101, { rca_done: "resolved" }, 2000); // flip clears in_flight + const reclaimed = 
reaper(csv, 600, 10_000_000); + assert.deepEqual(reclaimed, []); +}); + +test("pendingRows returns only pending work", () => { + seed(csv, "build-1", TESTS); + flip(csv, 101, { rca_done: "resolved" }, 1000); + const pend = pendingRows(csv); + assert.equal(pend.length, 1); + assert.equal(pend[0].testRunId, "102"); +}); + +test("CSV codec round-trips fields with commas, quotes, newlines", () => { + seed(csv, "build-1", [{ test_id: 200, test_name: "weird" }]); + flip( + csv, + 200, + { rca_done: "resolved", root_cause: 'Failed: "x", got <y>\nsecond line' }, + 1000, + ); + const row = readRows(csv).find((r) => r.testRunId === "200"); + assert.equal(row.root_cause, 'Failed: "x", got <y>\nsecond line'); +}); From bbee37db77095abfda7a1c83a7b42180ef12a901 Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:03:37 +0530 Subject: [PATCH 04/12] feat(rca): failure-signature clustering + sibling one-turn-confirm protocol lib/signature.mjs computes signature = normalize(category|error|file) off the U1 discovery payload (folds timestamps/uuids/hex/line:col/numbers), groups rows by signature, picks a deterministic representative (non-flaky, then smallest id), and leaves signal-less rows as their own singletons. references/clustering.md documents the O(causes) protocol: representative runs the full loop; siblings pre-seed a one-turn confirm against their own logs with a fall-back-to-own-loop safeguard (never blindly inherit). 
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- lib/signature.mjs | 78 ++++++++++++++++++++++ skills/rca-build/references/clustering.md | 60 +++++++++++++++++ tests/signature.test.mjs | 79 +++++++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 lib/signature.mjs create mode 100644 skills/rca-build/references/clustering.md create mode 100644 tests/signature.test.mjs diff --git a/lib/signature.mjs b/lib/signature.mjs new file mode 100644 index 0000000..42dc0ae --- /dev/null +++ b/lib/signature.mjs @@ -0,0 +1,78 @@ +// Failure-signature clustering (ideation #1). A red build's N failures usually +// trace to a handful of causes; clustering collapses the expensive evidence hunt +// to O(distinct causes). The signature is computed from the trimmed failure +// detail U1 surfaces on each listTestIds row (category + first error line + file +// path) — no extra probe turns. +// +// Dependency-free + deterministic (no crypto, no clock, no random) so it is +// usable from the auto-mode workflow sandbox and trivially testable. + +// Normalize a string for signature comparison: lowercase and fold the volatile +// tokens that make two instances of the SAME failure look different (ids, +// timestamps, hex/uuids, line:col, bare numbers). +export function normalize(value) { + return String(value ?? "") + .toLowerCase() + .replace(/\b\d{4}-\d{2}-\d{2}[t ]\d{2}:\d{2}:\d{2}\S*/g, "<ts>") // ISO timestamps + .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g, "<uuid>") + .replace(/0x[0-9a-f]+/g, "<hex>") // memory addresses + .replace(/:\d+(:\d+)?\b/g, ":<line>") // file:line(:col) + .replace(/\d+/g, "<n>") // remaining numbers (incl. unit-suffixed, e.g. 3000ms) + .replace(/\s+/g, " ") + .trim(); +} + +// The signature triple: normalized category | error summary | file path. 
+export function computeSignature(row) { + const category = normalize(row.failure_category); + const error = normalize(row.error_summary); + const file = normalize(row.file_path); + const sig = `${category}|${error}|${file}`; + return sig.replace(/\|/g, "").trim().length === 0 ? "" : sig; +} + +// Deterministic short id for a signature string (FNV-1a → base36). +function hashId(s) { + let h = 0x811c9dc5; + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i); + h = Math.imul(h, 0x01000193); + } + return (h >>> 0).toString(36); +} + +// A stable representative for a cluster: prefer a non-flaky member (a flaky test +// is a poor exemplar), then the smallest testRunId. Deterministic. +export function selectRepresentative(members) { + return [...members].sort((a, b) => { + const aFlaky = a.is_flaky === "true" || a.is_flaky === true ? 1 : 0; + const bFlaky = b.is_flaky === "true" || b.is_flaky === true ? 1 : 0; + if (aFlaky !== bFlaky) return aFlaky - bFlaky; + return Number(a.testRunId) - Number(b.testRunId); + })[0]; +} + +// Cluster rows by signature. Mutates each row's `cluster_id`. Rows with no +// signal (empty signature) become their own singleton (never merged into a +// catch-all). Returns { rows, clusters } where each cluster carries its +// representative + siblings. +export function clusterRows(rows) { + const groups = new Map(); + + for (const row of rows) { + const sig = computeSignature(row); + const id = sig === "" ? 
`solo-${row.testRunId}` : `c-${hashId(sig)}`; + row.cluster_id = id; + if (!groups.has(id)) groups.set(id, { cluster_id: id, signature: sig, members: [] }); + groups.get(id).members.push(row); + } + + const clusters = []; + for (const group of groups.values()) { + const representative = selectRepresentative(group.members); + const siblings = group.members.filter((m) => m !== representative); + clusters.push({ ...group, representative, siblings }); + } + + return { rows, clusters }; +} diff --git a/skills/rca-build/references/clustering.md b/skills/rca-build/references/clustering.md new file mode 100644 index 0000000..66face8 --- /dev/null +++ b/skills/rca-build/references/clustering.md @@ -0,0 +1,60 @@ +# Failure-signature clustering + +Why: a red build's N failures usually trace to a handful of causes (one bad +PR/deploy/shared helper). Running the full collaborative loop once per *cause* +instead of once per *test* turns the dominant cost from **O(tests) → O(distinct +causes)** — the only thing that makes "RCA for ALL failed tests, even thousands" +feasible. But **every failed test must still show a per-test RCA in the TRA +dashboard**, so clustering collapses the *evidence hunt*, not the *output*. + +The logic lives in `lib/signature.mjs`; this file is the protocol. + +## The signature + +Computed from the trimmed failure detail `listTestIds(includeFailureDetail=true)` +already returns on each row — **no extra probe turns**: + +``` +signature = normalize(failure_category) | normalize(error_summary) | normalize(file_path) +``` + +`normalize` folds the volatile tokens that make two instances of the *same* +failure look different: ISO timestamps, UUIDs, hex/memory addresses, `file:line:col`, +and bare numbers. So `timeout after 3000ms on node-7` and `timeout after 5000ms +on node-2` share a signature. + +A row with **no signal** (empty category, error, and path) is **not** merged into +a catch-all — it becomes its own singleton (`solo-<testRunId>`). 
Better an +un-clustered test than a wrong cluster. + +## Representative + siblings + +Each cluster gets: + +- **Representative** — a stable exemplar (non-flaky preferred, then smallest + `testRunId`). Runs the **full multi-turn `ai-tfa-coordinator` loop** → + confirmed root cause + culprit `related_prs`. +- **Siblings** (`N−1`) — each runs its **own** coordinator, **pre-seeded** with + the representative's `root_cause` + suspect PRs. TFA confirms the hypothesis + **against that sibling's own logs in a single turn** → a logs-grounded per-test + RCA in the dashboard at minimal cost. + +Net cost per cluster: **1 deep investigation + (N−1) one-turn confirms.** + +## The safeguard — never blindly inherit + +Distinct failures can share an error string. A sibling's pre-seed turn is a +*hypothesis to confirm*, not a verdict to copy: + +- TFA `RESOLVED`s the sibling in one turn → logs-grounded inheritance, cheap. +- TFA returns `NEEDS_INFO` / `BLOCKED` (the hypothesis does not hold for this + test's logs) → the sibling **falls back to its own full loop**. The + representative's cause is never stamped onto a sibling without log confirmation. + +This keeps correctness independent of the cost optimization: worst case, every +sibling runs its own full loop (same as no clustering); best case, one deep run +covers the whole cluster. + +## Singletons + +A cluster of one is just a plain per-test loop — no pre-seed, no confirm step. 
diff --git a/tests/signature.test.mjs b/tests/signature.test.mjs new file mode 100644 index 0000000..f721167 --- /dev/null +++ b/tests/signature.test.mjs @@ -0,0 +1,79 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { + normalize, + computeSignature, + selectRepresentative, + clusterRows, +} from "../lib/signature.mjs"; + +function row(id, extra = {}) { + return { + testRunId: String(id), + failure_category: "Assertion", + error_summary: "expected 200 but got 500", + file_path: "spec/login.rb", + is_flaky: "false", + ...extra, + }; +} + +test("normalize folds timestamps, uuids, hex, line:col, and numbers", () => { + assert.equal(normalize("Error at line :42:7"), "error at line :<line>"); + assert.equal(normalize("got 500 at 0xAF3"), "got <n> at <hex>"); + assert.equal( + normalize("failed 2026-06-23T10:00:00Z"), + "failed <ts>", + ); +}); + +test("identical category+error+path → same cluster", () => { + const { clusters } = clusterRows([row(1), row(2)]); + assert.equal(clusters.length, 1); + assert.equal(clusters[0].members.length, 2); +}); + +test("numbers in the error are folded so siblings still cluster", () => { + const a = row(1, { error_summary: "timeout after 3000ms on node-7" }); + const b = row(2, { error_summary: "timeout after 5000ms on node-2" }); + assert.equal(computeSignature(a), computeSignature(b)); + const { clusters } = clusterRows([a, b]); + assert.equal(clusters.length, 1); +}); + +test("distinct failures → distinct clusters", () => { + const a = row(1, { error_summary: "null pointer in Foo" }); + const b = row(2, { error_summary: "connection refused" }); + const { clusters } = clusterRows([a, b]); + assert.equal(clusters.length, 2); +}); + +test("rows with no signal become their own singletons (no catch-all merge)", () => { + const a = { testRunId: "1", failure_category: "", error_summary: "", file_path: "" }; + const b = { testRunId: "2", failure_category: "", error_summary: "", file_path: "" }; + const 
{ clusters } = clusterRows([a, b]); + assert.equal(clusters.length, 2); + assert.ok(clusters.every((c) => c.cluster_id.startsWith("solo-"))); +}); + +test("singleton cluster has a representative and no siblings", () => { + const { clusters } = clusterRows([row(1)]); + assert.equal(clusters[0].siblings.length, 0); + assert.equal(clusters[0].representative.testRunId, "1"); +}); + +test("representative is deterministic: non-flaky, then smallest testRunId", () => { + const members = [ + row(5, { is_flaky: "true" }), + row(9, { is_flaky: "false" }), + row(7, { is_flaky: "false" }), + ]; + assert.equal(selectRepresentative(members).testRunId, "7"); +}); + +test("clusterRows stamps cluster_id onto every row", () => { + const rows = [row(1), row(2, { error_summary: "different" })]; + clusterRows(rows); + assert.ok(rows.every((r) => r.cluster_id)); + assert.notEqual(rows[0].cluster_id, rows[1].cluster_id); +}); From 7ddb2c222bd9b3b479a3bce82d3ed0c5e75de69c Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:05:29 +0530 Subject: [PATCH 05/12] feat(rca): build-evidence pre-compute + cache + capability manifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit buildManifest enumerates the client's discovered capabilities once into capability→{available,via}, declared to the user + TFA so no evidence is asked for that the client provably can't get. lib/evidence-cache.mjs computes the last-green→this-build delta once and caches by (repo,range,evidenceType) — fresh per-run Map, no module globals (multi-tenant-safe) — with resolveBaseline for the never-green fallback. Routes the same grounded window into every coordinator. 
// ---- lib/evidence-cache.mjs -------------------------------------------------
// Build-level evidence cache (ideation #2). "Diff since last green", "deploy
// timeline", "PRs in the suspect window" are properties of the BUILD, not the
// test — yet a naive loop re-fetches them per test. Compute once, cache by
// (repo, commit-range, evidenceType), and pre-seed every coordinator with the
// same grounded suspect window. Collapses N×M redundant git/infra calls to ~M.
//
// The cache is created fresh per run (function-scoped Maps — never a module-level
// global), so it holds no cross-run/cross-user state: in-workspace, single
// session, multi-tenant-safe by construction.

/**
 * Create a fresh evidence cache keyed by (repo, range, evidenceType).
 *
 * @returns {{
 *   has(repo: *, range: *, evidenceType: *): boolean,
 *   get(repo: *, range: *, evidenceType: *): *,
 *   set(repo: *, range: *, evidenceType: *, value: *): *,
 *   compute(repo: *, range: *, evidenceType: *, fn: () => *): Promise<*>,
 *   size(): number,
 * }} cache handle; `has`/`get`/`size` only ever see settled values.
 */
export function makeEvidenceCache() {
  const store = new Map(); // key -> settled value
  const inFlight = new Map(); // key -> promise of a computation still running

  // Serialise the parts as a JSON array so a delimiter occurring inside one part
  // can never make two different triples collide. Nullish parts normalise to ""
  // and every part is stringified, so e.g. 1 and "1" still address the same slot.
  const keyOf = (repo, range, evidenceType) =>
    JSON.stringify([repo, range, evidenceType].map((part) => String(part ?? "")));

  return {
    has(repo, range, evidenceType) {
      return store.has(keyOf(repo, range, evidenceType));
    },
    get(repo, range, evidenceType) {
      return store.get(keyOf(repo, range, evidenceType));
    },
    set(repo, range, evidenceType, value) {
      store.set(keyOf(repo, range, evidenceType), value);
      return value;
    },
    // Compute-once: run `fn` only on a cache miss; reuse on every later call.
    // Callers that arrive while the first computation is still pending share its
    // promise instead of starting a second fetch. A rejected computation is not
    // cached, so a later call retries.
    async compute(repo, range, evidenceType, fn) {
      const k = keyOf(repo, range, evidenceType);
      if (store.has(k)) return store.get(k);

      const running = inFlight.get(k);
      if (running) return running;

      const task = (async () => {
        const value = await fn();
        store.set(k, value);
        return value;
      })();
      inFlight.set(k, task);

      // Registered after `set` so that even a synchronously-throwing `fn` has its
      // in-flight entry removed once the promise settles.
      const clear = () => {
        if (inFlight.get(k) === task) inFlight.delete(k);
      };
      task.then(clear, clear);

      return task;
    },
    size() {
      return store.size;
    },
  };
}

/**
 * Resolve the baseline ref for the last-green→this-build delta. When there is no
 * "last green" (e.g. a never-green flaky suite) fall back to a configured ref and
 * flag it so the report can note the weaker grounding.
 *
 * @param {string|null|undefined} lastGreenRef ref of the last green build, if any
 * @param {string|null|undefined} fallbackRef configured ref used when there is none
 * @returns {{ ref: string|null, isFallback: boolean }}
 */
export function resolveBaseline(lastGreenRef, fallbackRef) {
  if (lastGreenRef) return { ref: lastGreenRef, isFallback: false };
  return { ref: fallbackRef ?? null, isFallback: true };
}

// ---- lib/routing.mjs: capability manifest (ideation #3) ---------------------

/**
 * Build the capability manifest ONCE per run from the capabilities the client
 * agent actually discovered. Every capability the routing registry references
 * (except entries marked `skip` or owned by "tfa") appears in the manifest,
 * marked available iff it was discovered. Declaring this to TFA lets it avoid
 * asking for evidence the client can't get.
 *
 * @param {{ evidenceRouting?: Object<string, {capability?: string, skip?: boolean, owner?: string}> }} config
 * @param {Array<{ capability: string, via?: string }>} [discovered] what the
 *   orchestrator collected by asking "what skills/tools are available?"
 * @returns {Object<string, { available: boolean, via: string|null }>}
 */
export function buildManifest(config, discovered = []) {
  // Later duplicates win, matching Map-from-entries semantics.
  const byCap = new Map();
  for (const item of discovered ?? []) {
    if (item?.capability) byCap.set(item.capability, item);
  }

  const manifest = {};
  for (const entry of Object.values(config?.evidenceRouting ?? {})) {
    if (!entry || entry.skip || entry.owner === "tfa") continue;
    const cap = entry.capability;
    // Own-property check: `in` would also match inherited names such as
    // "constructor" and silently drop a capability with that name.
    if (!cap || Object.hasOwn(manifest, cap)) continue;
    const found = byCap.get(cap);
    manifest[cap] = found
      ? { available: true, via: found.via ?? null }
      : { available: false, via: null };
  }
  return manifest;
}

/**
 * Capabilities that will be unavailable this run — declared to the user up front
 * ("k8s + metrics not available") and to TFA so it plans asks around them.
 *
 * @param {Object<string, { available: boolean }>} [manifest] output of buildManifest
 * @returns {string[]} capability names whose entry is missing or not available
 */
export function unavailableCapabilities(manifest = {}) {
  return Object.entries(manifest ?? {})
    .filter(([, entry]) => !entry?.available)
    .map(([cap]) => cap);
}
// tests/evidence.test.mjs — manifest + evidence-cache behaviour.
// Relies on the file's imports: `test` (node:test), `assert` (node:assert/strict),
// buildManifest / unavailableCapabilities (../lib/routing.mjs) and
// makeEvidenceCache / resolveBaseline (../lib/evidence-cache.mjs).

// Routing fixture: two evidence types share the "github" capability, and
// test_logs is TFA-owned so it must never surface in the manifest.
const CONFIG = {
  evidenceRouting: {
    test_logs: { owner: "tfa", skip: true },
    product_code: { capability: "github" },
    deploy: { capability: "github" },
    k8s: { capability: "k8s" },
    metrics: { capability: "metrics" },
    other: { capability: "other" },
  },
};

test("buildManifest marks discovered capabilities available with via", () => {
  const manifest = buildManifest(CONFIG, [
    { capability: "github", via: "github-mcp" },
  ]);
  assert.equal(manifest.github.available, true);
  assert.equal(manifest.github.via, "github-mcp");
  assert.equal(manifest.k8s.available, false);
});

test("buildManifest excludes the TFA-owned test_logs capability", () => {
  const manifest = buildManifest(CONFIG, []);
  assert.ok(!("undefined" in manifest));
  assert.ok(!Object.keys(manifest).includes("test_logs"));
});

test("buildManifest dedupes capabilities shared by multiple evidence types", () => {
  // product_code + deploy both map to github → exactly one manifest entry.
  // Pin the whole key set: counting occurrences of one key in Object.keys() can
  // never exceed 1, so it would not catch a routing/dedupe regression.
  const manifest = buildManifest(CONFIG, [{ capability: "github" }]);
  assert.deepEqual(Object.keys(manifest), ["github", "k8s", "metrics", "other"]);
  assert.deepEqual(manifest.github, { available: true, via: null });
});

test("unavailableCapabilities lists what the client can't get", () => {
  const manifest = buildManifest(CONFIG, [{ capability: "github" }]);
  const unavailable = unavailableCapabilities(manifest).sort();
  assert.deepEqual(unavailable, ["k8s", "metrics", "other"]);
});

test("evidence cache computes once and reuses across calls", async () => {
  const cache = makeEvidenceCache();
  let calls = 0;
  const fn = async () => {
    calls++;
    return { prs: ["#1"] };
  };
  const a = await cache.compute("repo", "abc..def", "deploy", fn);
  const b = await cache.compute("repo", "abc..def", "deploy", fn);
  assert.equal(calls, 1);
  assert.deepEqual(a, b);
  assert.equal(cache.size(), 1);
});

test("evidence cache key distinguishes commit ranges", async () => {
  const cache = makeEvidenceCache();
  let calls = 0;
  const fn = async () => ++calls;
  await cache.compute("repo", "r1", "deploy", fn);
  await cache.compute("repo", "r2", "deploy", fn);
  assert.equal(calls, 2);
});

test("resolveBaseline uses last-green when present, else flags fallback", () => {
  assert.deepEqual(resolveBaseline("v1.2.3", "main"), {
    ref: "v1.2.3",
    isFallback: false,
  });
  assert.deepEqual(resolveBaseline(null, "main"), {
    ref: "main",
    isFallback: true,
  });
});
Sandbox-correct: does no state I/O itself (orchestrator passes the clustered work-list + manifest + pre-computed build evidence via args; each coordinator agent persists its own CSV row eagerly). Gap → 'unavailable' back to TFA, no user prompt. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- workflows/rca-batch.mjs | 130 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 workflows/rca-batch.mjs diff --git a/workflows/rca-batch.mjs b/workflows/rca-batch.mjs new file mode 100644 index 0000000..e577dbe --- /dev/null +++ b/workflows/rca-batch.mjs @@ -0,0 +1,130 @@ +export const meta = { + name: "rca-batch", + description: + "Drive collaborative RCA over all failed tests of a build (auto mode): cluster representatives run the full loop, siblings one-turn-confirm, ~5 concurrent.", + phases: [ + { title: "Representatives", detail: "full multi-turn RCA per cluster" }, + { title: "Siblings", detail: "one-turn confirm against own logs" }, + ], +}; + +// AUTO MODE orchestration (D2). This is a dynamic-workflow script: it runs in the +// Workflow sandbox (no filesystem, no Date.now/Math.random, agent()/pipeline() +// as globals). It therefore does NO state I/O itself — the orchestrator seeds the +// CSV, clusters, and builds the manifest in normal context and passes the +// work-list via `args`; each dispatched `ai-tfa-coordinator` agent (which HAS +// tool access) claims + flips its own CSV row eagerly (WAL); this script +// orchestrates concurrency and returns the structured results for reconciliation. +// +// args shape: +// { +// csvPath, buildId, mode: "auto", +// manifest: { capability: { available, via } }, +// buildEvidence: { baselineRef, suspectWindow, ... 
}, // pre-computed once +// clusters: [ +// { cluster_id, representative: { testRunId, testName, error_summary }, +// siblings: [ { testRunId, testName, error_summary } ] } +// ] +// } + +const RCA_SCHEMA = { + type: "object", + required: ["testRunId", "status"], + properties: { + testRunId: { type: "string" }, + status: { enum: ["RESOLVED", "BLOCKED", "PENDING", "failed"] }, + confidence: { enum: ["high", "medium", "low", "unknown"] }, + root_cause: { type: "string" }, + possible_fix: { type: "string" }, + related_prs: { type: "array", items: { type: "string" } }, + suspect_signals: { type: "array", items: { type: "string" } }, + threadId: { type: "string" }, + turnId: { type: "string" }, + turns_used: { type: "number" }, + asks_fulfilled: { type: "array", items: { type: "string" } }, + asks_skipped: { type: "array", items: { type: "string" } }, + asks_unavailable: { type: "array", items: { type: "string" } }, + cluster_id: { type: "string" }, + }, + additionalProperties: true, +}; + +const ctx = args ?? {}; +const clusters = ctx.clusters ?? []; +const shared = [ + `CSV state file: ${ctx.csvPath}`, + `Capability manifest: ${JSON.stringify(ctx.manifest ?? {})}`, + `Build-level evidence (pre-computed once, reuse — do not re-fetch): ${JSON.stringify(ctx.buildEvidence ?? {})}`, + `Mode: auto — on an evidence gap with no capability, report "unavailable" back to TFA (NEVER prompt a user). Best-effort finalize.`, + `Persist eagerly to the CSV: claim your row before turn 1, flip it on terminal (lib/csv-state.mjs).`, +].join("\n"); + +function repPrompt(cluster) { + const r = cluster.representative; + return [ + `You are the ai-tfa-coordinator for cluster ${cluster.cluster_id}.`, + `Run the FULL collaborative RCA loop for the representative test.`, + `testRunId=${r.testRunId} testName=${r.testName ?? ""}`, + `error_digest: ${r.error_summary ?? 
"(none)"}`, + shared, + `Return the structured RCA_OUTPUT for this test.`, + ].join("\n"); +} + +function siblingPrompt(sibling, repResult, cluster) { + return [ + `You are the ai-tfa-coordinator for a SIBLING of cluster ${cluster.cluster_id}.`, + `Pre-seed: the representative resolved as:`, + ` root_cause: ${repResult?.root_cause ?? "(representative did not resolve)"}`, + ` related_prs: ${JSON.stringify(repResult?.related_prs ?? [])}`, + `State this hypothesis on turn 1 and ask TFA to CONFIRM it against THIS test's own logs.`, + `If TFA confirms in one turn → done. If it does NOT (NEEDS_INFO/BLOCKED), fall back to the full loop — never blindly inherit.`, + `testRunId=${sibling.testRunId} testName=${sibling.testName ?? ""}`, + `error_digest: ${sibling.error_summary ?? "(none)"}`, + shared, + `Return the structured RCA_OUTPUT for this test.`, + ].join("\n"); +} + +log(`Auto-mode batch: ${clusters.length} cluster(s) over build ${ctx.buildId ?? "?"}`); + +// Pipeline: each cluster flows representative → siblings independently (no barrier +// between stages), so a small cluster's siblings confirm while a big cluster's +// representative is still looping. Concurrency is bounded by the workflow runtime +// (~min(16, cores-2)); config.concurrency (5) is the intended soft target. +const results = await pipeline( + clusters, + (cluster) => + agent(repPrompt(cluster), { + label: `rep:${cluster.representative.testRunId}`, + phase: "Representatives", + agentType: "ai-tfa-coordinator", + schema: RCA_SCHEMA, + }).then((rca) => ({ cluster, rca })), + ({ cluster, rca }) => + parallel( + (cluster.siblings ?? 
[]).map((sib) => () => + agent(siblingPrompt(sib, rca, cluster), { + label: `sib:${sib.testRunId}`, + phase: "Siblings", + agentType: "ai-tfa-coordinator", + schema: RCA_SCHEMA, + }), + ), + ).then((sibs) => ({ + cluster_id: cluster.cluster_id, + representative: rca, + siblings: sibs.filter(Boolean), + })), +); + +const flat = results.filter(Boolean); +const all = flat.flatMap((r) => [r.representative, ...(r.siblings ?? [])]).filter(Boolean); +const byStatus = all.reduce((acc, r) => { + acc[r.status] = (acc[r.status] ?? 0) + 1; + return acc; +}, {}); + +log(`Auto-mode batch complete: ${all.length} test(s) — ${JSON.stringify(byStatus)}`); + +return { clusters: flat.length, tests: all.length, byStatus, results: flat }; From 28ebc1d29c2d6501098ea446e105c5f01e97fd18 Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:08:45 +0530 Subject: [PATCH 07/12] =?UTF-8?q?feat(rca):=20interactive=20mode=20?= =?UTF-8?q?=E2=80=94=20subagents=20with=20user-in-the-loop=20gap-return?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit references/interactive-mode.md specifies the orchestrator loop: spawn ai-tfa-coordinator subagents 5 at a time; a subagent cannot pause to prompt the user, so on an evidence gap it ends early with a GAP_OUTPUT carrying resume handles (threadId+turnId); the orchestrator asks A1, then re-dispatches with resume= and the answer. Same coordinator as auto — only the gap action differs. Compact blocks not transcripts (lean main context); partial-first; auto-first/ escalate-the-residue noted. 
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- agents/ai-tfa-coordinator.md | 31 ++++++++-- skills/rca-build/SKILL.md | 7 ++- .../rca-build/references/interactive-mode.md | 56 +++++++++++++++++++ 3 files changed, 86 insertions(+), 8 deletions(-) create mode 100644 skills/rca-build/references/interactive-mode.md diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md index e2045fb..4d42f17 100644 --- a/agents/ai-tfa-coordinator.md +++ b/agents/ai-tfa-coordinator.md @@ -71,12 +71,33 @@ capability available for that `evidenceType`): - **auto** → emit an `unavailable` block back to TFA (no user prompt). TFA finalizes best-effort with lower confidence. -- **interactive** → **return the gap to the caller** (the main agent), which asks - the user (A1) for that data, then feeds the answer back. A subagent cannot - prompt the user itself. +- **interactive** → a subagent cannot pause to prompt the user, so **end the run + early and return a `GAP_OUTPUT` block** (status `PENDING`) carrying the resume + handles + the gap. The orchestrator asks A1, then **re-dispatches a coordinator + with `resume={threadId, turnId}`** and the answer digested into the next turn. + See `references/interactive-mode.md`. -Everything else — the loop, routing, digest, caps, output — is identical across -modes. Do not fork the loop; only the gap action differs. +`GAP_OUTPUT` block (interactive gap only): + +``` +GAP_OUTPUT_START +## testRunId +<integer> +## thread_id +<threadId> +## turn_id +<turnId> # resume handle +## gap +- evidenceType: <type> +- what: <verbatim ask `what`> +- why: <verbatim ask `why`> +GAP_OUTPUT_END +``` + +Everything else — the loop, routing, digest, caps, terminal output — is identical +across modes. Do not fork the loop; only the gap action differs. When all gaps in +a turn are resolvable (gathered or user-answered), the loop proceeds normally to a +terminal `RCA_OUTPUT`. 
## The loop diff --git a/skills/rca-build/SKILL.md b/skills/rca-build/SKILL.md index 4bf6b34..5248bee 100644 --- a/skills/rca-build/SKILL.md +++ b/skills/rca-build/SKILL.md @@ -92,9 +92,10 @@ representatives deep, siblings one-turn-confirm. Eagerly persist to the CSV/WAL - **auto** → run the dynamic workflow `workflows/rca-batch.mjs` (script-orchestrated, no user input; gap → "unavailable" back to TFA → best-effort finalize). - **interactive** → spawn `ai-tfa-coordinator` subagents 5 at a time; on an - evidence gap a subagent returns the gap to this orchestrator, which asks the - user (A1), then feeds the answer back. Subagents return compact `RCA_OUTPUT` - blocks, not transcripts (keeps the main context lean for large batches). + evidence gap a subagent ends early with a `GAP_OUTPUT` (resume handles), and + this orchestrator asks the user (A1) then re-dispatches with `resume=`. Subagents + return compact blocks, not transcripts (keeps the main context lean for large + batches). Full protocol: `references/interactive-mode.md`. Both modes use the **same** `ai-tfa-coordinator`; only the injected gap-resolver differs. A coordinator that dies becomes a recorded `failed` row — one stuck test diff --git a/skills/rca-build/references/interactive-mode.md b/skills/rca-build/references/interactive-mode.md new file mode 100644 index 0000000..493ea65 --- /dev/null +++ b/skills/rca-build/references/interactive-mode.md @@ -0,0 +1,56 @@ +# Interactive mode — subagents with a user in the loop + +Interactive mode (D2) puts the human (A1) in the loop **only at the orchestrator +layer**. The main session spawns `ai-tfa-coordinator` subagents to investigate in +parallel; when a subagent needs evidence it can't get, it hands the gap back up to +the orchestrator, which asks the user and feeds the answer down. + +This is the **same coordinator** the auto workflow uses — only the gap-resolver +differs (auto → "unavailable"; interactive → return the gap). 
+ +## Why a subagent can't just "ask the user" + +A dispatched subagent runs to completion and returns one final message — it +cannot pause mid-run, prompt the user, and resume. So the gap-return is modeled +as **early termination with resume handles**, and the orchestrator drives the +ask-and-resume loop. + +## The orchestrator loop (per batch of ≤ `concurrency`, default 5) + +``` +1. Take the next ≤5 pending work items (representatives first, then siblings). +2. Dispatch one ai-tfa-coordinator subagent per item, mode=interactive, passing + the manifest + pre-computed build evidence + (for siblings) the pre-seed. +3. Each subagent runs its loop until either: + - a terminal status → returns RCA_OUTPUT (the orchestrator flips the CSV row), or + - an interactive GAP → returns GAP_OUTPUT (status=PENDING) carrying: + { testRunId, threadId, turnId, gap: { evidenceType, what, why } } +4. For each GAP_OUTPUT: ASK A1 for that evidence (one focused question). + - A1 answers → re-dispatch a coordinator with resume={threadId,turnId} and + the answer digested into the next turn's message. Continue its loop. + - A1 has nothing → tell the coordinator to report "unavailable" on resume + (degrade exactly like auto for that one ask). +5. Repeat until every row is terminal. Then dispatch the next batch. +``` + +## Aggregation discipline (large batches) + +Subagents return **compact `RCA_OUTPUT` / `GAP_OUTPUT` blocks, never transcripts** +— mirroring the auto workflow's "results in script vars" rule — so the main +agent's context stays lean even over hundreds of tests. The orchestrator never +holds full per-test loop transcripts; it holds one block per test. + +## Partial-first + +A subagent that dies becomes a recorded `failed` row (the orchestrator +synthesizes it). One stuck test never sinks the batch — same contract as auto. 
+ +## When to prefer interactive over auto + +- The client is missing infra skills the failures clearly need (k8s/kibana), and + the user can supply that evidence by hand. +- The user wants to steer or sign off mid-run. + +Otherwise auto is cheaper (no human round-trips). Both write the same CSV rows +and the same report, so a run can start auto and the residual BLOCKED/gap tests +can be re-run interactively (the auto-first / escalate-the-residue pattern). From e8b70e644225bc448fc47a0508b61223456cf07a Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:09:44 +0530 Subject: [PATCH 08/12] feat(rca): suspect-PR falsification packet + GitHub-evidence spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit references/github-evidence.md specifies exactly what each github ask needs (diff-since-baseline, PRs-in-window touching the failing path, blame, deploy timing) and the discovery order GitHub MCP → gh → degrade — no shipped forensics harness. Adds the adversarial falsification protocol (path overlap / deploy-state guard / direction) so only verdict:supported suspects enter related_prs; ruled-out suspects stay as disconfirming evidence. Coordinator runs it for product_code/ deploy/ci asks, reusing the pre-computed build evidence. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- agents/ai-tfa-coordinator.md | 12 +++ .../rca-build/references/github-evidence.md | 77 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 skills/rca-build/references/github-evidence.md diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md index 4d42f17..60b9753 100644 --- a/agents/ai-tfa-coordinator.md +++ b/agents/ai-tfa-coordinator.md @@ -99,6 +99,18 @@ across modes. Do not fork the loop; only the gap action differs. When all gaps i a turn are resolvable (gathered or user-answered), the loop proceeds normally to a terminal `RCA_OUTPUT`. 
+## Suspect-PR falsification (github asks) + +For `product_code` / `deploy` / `ci` asks, follow `references/github-evidence.md`: +gather the **exact** evidence (diff-since-baseline, PRs-in-window touching the +failing path, blame, deploy timing) via **GitHub MCP → `gh` → degrade**, and for +each candidate suspect **try to disprove it** (path overlap? shipped before the +failure window? behind an OFF flag?). Feed both supporting *and* disconfirming +evidence back as a structured suspect packet; only `verdict: supported` suspects +belong in `related_prs`. Reuse the pre-computed build-level evidence — do not +re-fetch per test. Never fabricate a PR when the github capability is unavailable +— emit an `unavailable` block. + ## The loop ``` diff --git a/skills/rca-build/references/github-evidence.md b/skills/rca-build/references/github-evidence.md new file mode 100644 index 0000000..fa24aaa --- /dev/null +++ b/skills/rca-build/references/github-evidence.md @@ -0,0 +1,77 @@ +# GitHub evidence — what to gather, and how to rule a suspect OUT + +The worst automated-RCA outcome is **confidently blaming an innocent PR**. This +file is the contract for `product_code` / `deploy` / `ci` asks (the `github` +capability): the **exact** evidence to gather, and a **falsification protocol** +that tries to *disprove* each suspect before it enters `related_prs`. + +> We do **not** ship a GitHub forensics harness or MCP tool. We specify what's +> needed and use whatever the client already has — **GitHub MCP if available, +> else `gh`, else degrade** to an `unavailable` block. + +## Capability discovery (in order) + +1. **GitHub MCP** (`mcp__github__*`) — preferred for structured PR/diff/blame queries. +2. **`gh` CLI** — fall back for git-graph operations (`gh pr list --search`, + `gh api`, `merge-base`, ancestry) and anything the MCP doesn't cover. +3. **Neither** → emit an `unavailable` block for the ask (do not fabricate a PR). 
+ +The orchestrator records which is present in the capability manifest +(`capability: github → { available, via }`); route every github ask against it. + +## Evidence each ask needs (be specific — no fishing) + +| Ask intent | Gather exactly | +|---|---| +| "Did `<X>` change since the last passing run?" | the diff of `<X>`'s file/function between the **baseline ref** (last-green, or the configured fallback) and the build's commit — not the whole repo diff | +| "Which PRs are suspect?" | PRs **merged in the window** `(baselineRef, build commit]` that **touch the failing code path** — intersect changed files with the failing file/function | +| "Who/what last changed the failing line?" | `blame` on the specific failing lines (from the test's `file_path` + the error) | +| "What shipped to the run's env before the failure?" | deploy timeline (`gh` releases/tags + the env's deploy record); compare deploy time vs. the run's `started_at` | +| "Did CI change?" | the workflow-file diff + recent `gh run` history for the failing job | + +Scope everything by the failing test's `file_path` + the error summary. The +build-level evidence (diff-since-last-green, PR window) is **pre-computed once** +and passed in — reuse it; do not re-fetch per test. + +## Falsification protocol — rule out, don't just rule in + +For **each** candidate suspect PR, try to **break** the hypothesis: + +1. **Path overlap.** Do the PR's changed hunks actually touch the failing code + path (the function/line in the stack)? No overlap → **ruled out**. +2. **Deployment-state guard.** Was the PR's code actually **live** in the run's + env at `started_at`? If it shipped *after* the failure window, or sits behind + an **OFF** flag, it could not have caused this failure → **ruled out**. +3. **Direction.** Does the change plausibly produce *this* error (e.g. a validator + tightened to reject the input the test sends)? If the change is unrelated to + the symptom → **weak**, mark accordingly. 
+ +Feed **both supporting and disconfirming** evidence back to TFA. A suspect that +survives 1–3 is a real candidate; one that fails any is reported as ruled-out +(with the reason), **not** dropped silently. + +## The suspect packet (structured, not free text) + +Each surviving/ruled-out suspect is one structured block so `related_prs` +populates deterministically: + +``` +SUSPECT: + pr: <#number> + files: <changed files overlapping the failing path> + hunks: <the 1-3 load-bearing changed hunks — see digest size caps> + author: <login> + merged_at: <ts> vs last_green: <ts> vs started_at: <ts> + verdict: supported | ruled-out (<reason: no-path-overlap | shipped-after | behind-off-flag | unrelated>) + link: <PR permalink> +``` + +Only `verdict: supported` suspects should end up in TFA's `related_prs`. Ruled-out +suspects stay in the thread as disconfirming evidence so TFA (and a human) can see +the elimination, not just the conclusion. + +## Digest discipline + +Same caps as `references/evidence-routing.md`: prefer a PR **link** over pasting a +diff; at most 1 hunk (3 hard) per `product_code` snippet; never paste a full diff. +The packet is *findings*, not the haystack. From c5a4e9ecb219a3fd52ca7bdecc0365069f543b6d Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:11:34 +0530 Subject: [PATCH 09/12] feat(rca): coverage stamp + degrade-don't-crash report (resume reaper in U4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lib/coverage.mjs derives a per-row evidence-coverage band — TFA confidence capped by coverage (full keeps it, partial→medium, thin→low) so a RESOLVED built with evidence unavailable reads as lower confidence BECAUSE of the gap. lib/report.mjs renders the CSV to markdown: status counts + per-test table + coverage caveats, degrading missing fields to 'not available' and never crashing on an empty/partial batch. 
// ---- lib/coverage.mjs -------------------------------------------------------
// Evidence-coverage stamp (ideation #6, v1 — the per-row coverage band; the
// build-level blast-radius digest is deferred). A RESOLVED RCA built with
// k8s+kibana+metrics all "unavailable" must not read like one with full
// evidence. The client (which routed every ask) stamps each row with a coverage
// vector and derives a coverage-capped confidence band the reviewer sees:
// "low confidence BECAUSE kibana was unavailable", not "low confidence, trust me".

// Confidence bands, weakest first; the index is used for capping.
const BAND_ORDER = ["low", "medium", "high"];

// Distinct truthy asks; a nullish list counts as "no asks" instead of throwing.
function uniqueAsks(asks) {
  return [...new Set((asks ?? []).filter(Boolean))];
}

/**
 * Coverage classification from what was fulfilled vs. left unavailable.
 *
 * @param {string[]|null} [asksFulfilled] evidence types that were gathered
 * @param {string[]|null} [asksUnavailable] evidence types that could not be gathered
 * @returns {"full"|"partial"|"thin"} full = nothing unavailable; partial = some
 *   fulfilled and some unavailable; thin = only gaps
 */
export function classifyCoverage(asksFulfilled = [], asksUnavailable = []) {
  if (uniqueAsks(asksUnavailable).length === 0) return "full";
  return uniqueAsks(asksFulfilled).length > 0 ? "partial" : "thin";
}

// Cap the band: full coverage keeps TFA's confidence; partial caps at medium;
// thin caps at low. Unknown/absent TFA confidence floors to low.
function capBand(tfaConfidence, coverage) {
  const base = BAND_ORDER.includes(tfaConfidence) ? tfaConfidence : "low";
  const cap = coverage === "full" ? "high" : coverage === "partial" ? "medium" : "low";
  return BAND_ORDER[Math.min(BAND_ORDER.indexOf(base), BAND_ORDER.indexOf(cap))];
}

/**
 * The stamp written to a row at flip time.
 *
 * @param {{ asksFulfilled?: string[]|null, asksUnavailable?: string[]|null,
 *           tfaConfidence?: string }|null} [input]
 * @returns {{ coverage: "full"|"partial"|"thin", band: "low"|"medium"|"high",
 *             unavailable: string[] }}
 */
export function coverageStamp(input = {}) {
  const {
    asksFulfilled = [],
    asksUnavailable = [],
    tfaConfidence = "unknown",
  } = input ?? {};
  const coverage = classifyCoverage(asksFulfilled, asksUnavailable);
  return {
    coverage,
    band: capBand(tfaConfidence, coverage),
    unavailable: uniqueAsks(asksUnavailable),
  };
}

// ---- lib/report.mjs ---------------------------------------------------------
// Deterministic markdown report for a finished (or partial) batch. Degrade,
// don't crash: any missing field renders as "not available"; an empty batch
// still renders a valid report. Reads the CSV/WAL spine; no per-test transcripts.

const NA = "not available";

// Render one table cell: blank/nullish → NA, otherwise a single-line string
// with pipes escaped.
function cell(value) {
  const s = value == null ? "" : String(value).trim();
  if (s === "") return NA;
  // keep the table one-line-per-row: collapse newlines, escape pipes
  return s.replace(/\s*\n\s*/g, " ").replace(/\|/g, "\\|");
}

// Tally rows by `key` ("unknown" when blank). Counting goes through a Map so a
// value that names an Object.prototype member (e.g. "constructor") is counted
// like any other instead of colliding with the inherited property.
function countBy(rows, key) {
  const counts = new Map();
  for (const r of rows) {
    const k = String(r[key] || "unknown");
    counts.set(k, (counts.get(k) ?? 0) + 1);
  }
  return Object.fromEntries(counts);
}

/**
 * Render from a rows array (testable) — or pass a csvPath via renderReportFromCsv.
 *
 * @param {Array<object>|null|undefined} rows CSV rows; non-object entries are skipped
 * @param {{ buildId?: string, generatedAt?: string }} [opts]
 * @returns {string} markdown, always newline-terminated
 */
export function renderReport(rows, { buildId, generatedAt } = {}) {
  const lines = [];
  lines.push(`# RCA report${buildId ? ` — build ${buildId}` : ""}`);
  if (generatedAt) lines.push(`\nGenerated: ${generatedAt}`);

  // Degrade, don't crash: ignore anything that is not a row object.
  const validRows = Array.isArray(rows)
    ? rows.filter((r) => r != null && typeof r === "object")
    : [];

  if (validRows.length === 0) {
    lines.push("\nNo failed tests analyzed.");
    return lines.join("\n") + "\n";
  }

  const byState = countBy(validRows, "rca_done");
  const summary = Object.entries(byState)
    .map(([k, v]) => `${k}: ${v}`)
    .join(" · ");
  lines.push(`\n**${validRows.length} test(s)** — ${summary}\n`);

  lines.push(
    "| testRunId | test | status | confidence | coverage | root cause | related PRs |",
  );
  lines.push("|---|---|---|---|---|---|---|");
  for (const r of validRows) {
    lines.push(
      `| ${cell(r.testRunId)} | ${cell(r.testName)} | ${cell(r.rca_done)} | ${cell(
        r.confidence,
      )} | ${cell(r.coverage)} | ${cell(r.root_cause)} | ${cell(r.related_prs)} |`,
    );
  }

  // Surface coverage caveats so a "low confidence" reads as "because X unavailable".
  const thin = validRows.filter((r) => r.coverage === "thin" || r.coverage === "partial");
  if (thin.length > 0) {
    lines.push(`\n## Coverage caveats`);
    for (const r of thin) {
      lines.push(
        `- ${cell(r.testRunId)} (${cell(r.coverage)} coverage): confidence band reflects evidence that was unavailable, not just model certainty.`,
      );
    }
  }

  return lines.join("\n") + "\n";
}

/**
 * Render the report straight from the CSV state file.
 * `readRows` is the CSV reader that lib/report.mjs imports from ./csv-state.mjs.
 *
 * @param {string} csvPath path of the CSV state file
 * @param {{ buildId?: string, generatedAt?: string }} [opts]
 * @returns {string} markdown report
 */
export function renderReportFromCsv(csvPath, opts = {}) {
  return renderReport(readRows(csvPath), opts);
}
+ +## Coverage stamp (ideation #6, v1) + +At flip time the orchestrator stamps each row (`lib/coverage.mjs`) from the +coordinator's `asks_fulfilled` / `asks_unavailable` + TFA's confidence: + +- **coverage** — `full` (no gaps) · `partial` (some fulfilled, some unavailable) · + `thin` (nothing fulfilled, only gaps). +- **band** — TFA's confidence **capped by coverage**: `full` keeps it, `partial` + caps at `medium`, `thin` caps at `low`; unknown floors to `low`. + +So a RESOLVED with kibana/k8s unavailable reads as a lower band *because* evidence +was missing — not the same as a fully-evidenced RESOLVED. The report's **Coverage +caveats** section spells this out per affected row. + +> Out of v1 scope: the build-level **blast-radius digest** (rows inverted by +> culprit PR, ranked) — deferred to follow-up. The per-row coverage stamp ships now. + +## Report layout (`lib/report.mjs` → `renderReport`) + +- Header + build id + generated-at. +- One-line summary: total + counts by `rca_done`. +- A per-test table: `testRunId | test | status | confidence | coverage | root cause | related PRs`. +- A **Coverage caveats** list for `partial`/`thin` rows. + +**Degrade, don't crash:** any missing field renders as `not available`; an empty +batch renders "No failed tests analyzed."; pipes are escaped and newlines +collapsed so the table never breaks. + +## Resume (ideation #7) + +On startup the orchestrator runs the **reaper** (`lib/csv-state.mjs` → `reaper`): +rows stuck `in_flight` with a heartbeat older than `reaperHeartbeatTtlSec` are +reclaimed to `pending` (a crashed worker's rows), then fan-out re-points at the +CSV. A row that retains a live `threadId`/`turnId` resumes that TFA thread; a dead +thread re-runs from `pending`. In-session / in-workspace only — cross-session +durability is deferred. 
diff --git a/tests/coverage-report.test.mjs b/tests/coverage-report.test.mjs new file mode 100644 index 0000000..7d7cb94 --- /dev/null +++ b/tests/coverage-report.test.mjs @@ -0,0 +1,108 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { coverageStamp, classifyCoverage } from "../lib/coverage.mjs"; +import { renderReport } from "../lib/report.mjs"; + +// ---- coverage stamp -------------------------------------------------------- + +test("full coverage keeps TFA confidence", () => { + const s = coverageStamp({ + asksFulfilled: ["product_code"], + asksUnavailable: [], + tfaConfidence: "high", + }); + assert.equal(s.coverage, "full"); + assert.equal(s.band, "high"); +}); + +test("partial coverage caps a high TFA confidence at medium", () => { + const s = coverageStamp({ + asksFulfilled: ["product_code"], + asksUnavailable: ["kibana"], + tfaConfidence: "high", + }); + assert.equal(s.coverage, "partial"); + assert.equal(s.band, "medium"); + assert.deepEqual(s.unavailable, ["kibana"]); +}); + +test("thin coverage (nothing fulfilled, gaps) caps at low", () => { + const s = coverageStamp({ + asksFulfilled: [], + asksUnavailable: ["k8s", "metrics"], + tfaConfidence: "high", + }); + assert.equal(s.coverage, "thin"); + assert.equal(s.band, "low"); +}); + +test("unknown TFA confidence floors to low even at full coverage", () => { + const s = coverageStamp({ asksFulfilled: [], asksUnavailable: [], tfaConfidence: "unknown" }); + assert.equal(s.coverage, "full"); + assert.equal(s.band, "low"); +}); + +test("classifyCoverage dedupes and handles empties", () => { + assert.equal(classifyCoverage(["a", "a"], []), "full"); + assert.equal(classifyCoverage([], ["x"]), "thin"); +}); + +// ---- report ---------------------------------------------------------------- + +test("empty batch renders a valid report, no crash", () => { + const md = renderReport([], { buildId: "b1" }); + assert.match(md, /No failed tests analyzed/); +}); + +test("report 
renders a row table with status counts", () => { + const rows = [ + { + testRunId: "101", + testName: "login", + rca_done: "resolved", + confidence: "high", + coverage: "full", + root_cause: "PR #7421 tightened validator", + related_prs: "#7421", + }, + { + testRunId: "102", + testName: "checkout", + rca_done: "blocked", + confidence: "", + coverage: "", + root_cause: "", + related_prs: "", + }, + ]; + const md = renderReport(rows, { buildId: "b1" }); + assert.match(md, /2 test\(s\)/); + assert.match(md, /resolved: 1/); + assert.match(md, /blocked: 1/); + assert.match(md, /101/); + assert.match(md, /not available/); // 102's blank fields degrade +}); + +test("report escapes pipes and collapses newlines in cells", () => { + const rows = [ + { + testRunId: "1", + testName: "t", + rca_done: "resolved", + root_cause: "a | b\nsecond line", + related_prs: "#1", + }, + ]; + const md = renderReport(rows); + assert.ok(!md.includes("a | b\nsecond")); + assert.match(md, /a \\\| b second line/); +}); + +test("report surfaces coverage caveats for thin/partial rows", () => { + const rows = [ + { testRunId: "1", testName: "t", rca_done: "resolved", coverage: "partial" }, + ]; + const md = renderReport(rows); + assert.match(md, /Coverage caveats/); + assert.match(md, /confidence band reflects evidence that was unavailable/); +}); From e277811ecbdd58fc3bdbf18732615560a7cc26cb Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:14:25 +0530 Subject: [PATCH 10/12] feat(rca): conformance fixture + executable loop mirror / sequential harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lib/loop.mjs (runRcaLoop) is an executable mirror of the coordinator loop — status branching, ask routing, gap resolution, turn-cap, one-thread, soft-PENDING — driven by an injected submit(). It doubles as the D5 sequential thin-client harness. 
tests/conformance.test.mjs replays recorded tfaRcaTurn transcripts (resolved/blocked/pending/turn-cap fixtures) and proves: rca capture, test_logs skip, soft-PENDING no-re-poll, turn-cap never submits a 7th turn, and the degraded (no-capability auto) path still reaches a valid terminal RCA — same loop, same result. 48 tests green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- agents/ai-tfa-coordinator.md | 6 + lib/loop.mjs | 125 ++++++++++++++++++ tests/conformance.test.mjs | 133 ++++++++++++++++++++ tests/fixtures/recorded-turns/blocked.json | 13 ++ tests/fixtures/recorded-turns/pending.json | 12 ++ tests/fixtures/recorded-turns/resolved.json | 37 ++++++ tests/fixtures/recorded-turns/turn-cap.json | 12 ++ 7 files changed, 338 insertions(+) create mode 100644 lib/loop.mjs create mode 100644 tests/conformance.test.mjs create mode 100644 tests/fixtures/recorded-turns/blocked.json create mode 100644 tests/fixtures/recorded-turns/pending.json create mode 100644 tests/fixtures/recorded-turns/resolved.json create mode 100644 tests/fixtures/recorded-turns/turn-cap.json diff --git a/agents/ai-tfa-coordinator.md b/agents/ai-tfa-coordinator.md index 60b9753..8f5cb4f 100644 --- a/agents/ai-tfa-coordinator.md +++ b/agents/ai-tfa-coordinator.md @@ -139,6 +139,12 @@ re-fetch per test. Never fabricate a PR when the github capability is unavailabl 6. EMIT the RCA_OUTPUT block from the captured terminal state. ``` +> The loop mechanics above have an **executable mirror** in `lib/loop.mjs` +> (`runRcaLoop`) — conformance-tested against recorded `tfaRcaTurn` transcripts +> (`tests/conformance.test.mjs`). It also serves as the **sequential thin-client +> harness** (D5): MCP clients without workflows/subagents drive the same contract +> by calling `runRcaLoop` with a real `submit` bound to `tfaRcaTurn`. 
+ **Sibling confirm (cluster member).** When `pre_seed` is present the first turn states the representative's hypothesis and asks TFA to confirm against this test's own logs. If TFA `RESOLVED`s in one turn → a logs-grounded per-test RCA at diff --git a/lib/loop.mjs b/lib/loop.mjs new file mode 100644 index 0000000..1d1728d --- /dev/null +++ b/lib/loop.mjs @@ -0,0 +1,125 @@ +// Executable mirror of the ai-tfa-coordinator loop (agents/ai-tfa-coordinator.md). +// It drives the collaborative loop against an injected `submit` (real = the +// tfaRcaTurn MCP tool; tests = a recorded-turn replayer), so the loop mechanics — +// status branching, ask routing, gap resolution, turn-cap, one-thread, +// soft-PENDING — are tested rather than assumed. +// +// Double duty: this is ALSO the **sequential thin-client harness** (D5 / ideation +// #4) — the third caller of the same contract, for MCP clients without +// workflows/subagents. Pure + dependency-light (imports only the routing registry). + +import { routeAsks } from "./routing.mjs"; + +function unavailableBlock(gap) { + const what = gap?.ask?.what ?? ""; + return [ + `ASK: ${what}`, + `TYPE: ${gap.evidenceType}`, + `FOUND: no`, + `SUMMARY: unavailable — no ${gap.capability} capability for this client.`, + ].join("\n"); +} + +// runRcaLoop drives one test to a terminal RCA_OUTPUT object. +// +// submit({ testRunId, message, threadId, turnId }) → Promise<turn> (tfaRcaTurn shape) +// gather(routedGatherEntry) → Promise<string> (one digest block) +// resolveGap(routedGapEntry) → Promise<{ digest } | null> (auto: null; interactive: a digest) +export async function runRcaLoop({ + testRunId, + firstMessage = "", + submit, + config = {}, + manifest = {}, + gather = async () => "", + resolveGap = async () => null, + turnCap = config?.turnCap ?? 
6, +}) { + if (testRunId == null || Number.isNaN(Number(testRunId))) { + return { + testRunId: String(testRunId), + status: "failed", + root_cause: "no testRunId provided", + turns_used: 0, + asks_fulfilled: [], + asks_skipped: [], + asks_unavailable: [], + }; + } + + let threadId; + let turnId; + let turns = 0; + let message = firstMessage; + const fulfilled = new Set(); + const skipped = new Set(); + const unavailable = new Set(); + + const out = (status, turn, note) => { + const rca = turn?.rca ?? {}; + return { + testRunId: String(testRunId), + status, + confidence: turn?.confidence ?? "unknown", + root_cause: + status === "RESOLVED" + ? (rca.root_cause ?? "") + : status === "BLOCKED" + ? (turn?.reason ?? "") + : (note ?? ""), + possible_fix: rca.possible_fix ?? "", + related_prs: rca.related_prs ?? [], + threadId: threadId ?? null, + turnId: turnId ?? null, + turns_used: turns, + asks_fulfilled: [...fulfilled], + asks_skipped: [...skipped], + asks_unavailable: [...unavailable], + }; + }; + + while (true) { + turns++; + const turn = await submit({ testRunId, message, threadId, turnId }); + threadId = turn.threadId ?? threadId; + + if (turn.status === "RESOLVED") return out("RESOLVED", turn); + if (turn.status === "BLOCKED") return out("BLOCKED", turn); + if (turn.status === "PENDING") { + turnId = turn.turnId ?? turnId; + return out("PENDING", turn, "soft-pending"); + } + + // NEEDS_INFO: route + fulfill. + const buckets = routeAsks(turn.asks ?? 
[], config, manifest); + const blocks = []; + for (const s of buckets.skip) skipped.add(s.evidenceType); + for (const g of buckets.gather) { + blocks.push(await gather(g)); + fulfilled.add(g.evidenceType); + } + for (const gap of buckets.gap) { + const resolved = await resolveGap(gap); + if (resolved && resolved.digest) { + blocks.push(resolved.digest); + fulfilled.add(gap.evidenceType); + } else { + unavailable.add(gap.evidenceType); + blocks.push(unavailableBlock(gap)); + } + } + + if (turns >= turnCap) return out("PENDING", turn, "turn-cap"); + message = blocks.join("\n\n"); + } +} + +// Replay helper for tests: returns a submit() that yields recorded turns in order. +export function replaySubmit(turns) { + let i = 0; + return async () => { + const turn = turns[Math.min(i, turns.length - 1)]; + i++; + return turn; + }; +} diff --git a/tests/conformance.test.mjs b/tests/conformance.test.mjs new file mode 100644 index 0000000..5ea6192 --- /dev/null +++ b/tests/conformance.test.mjs @@ -0,0 +1,133 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import { dirname, join } from "node:path"; +import { runRcaLoop, replaySubmit } from "../lib/loop.mjs"; + +const here = dirname(fileURLToPath(import.meta.url)); +const load = (name) => + JSON.parse(readFileSync(join(here, "fixtures", "recorded-turns", name), "utf8")); + +const CONFIG = { + turnCap: 6, + evidenceRouting: { + test_logs: { owner: "tfa", skip: true }, + product_code: { capability: "github" }, + other: { capability: "other" }, + }, +}; +const GITHUB_AVAILABLE = { github: { available: true, via: "gh" } }; + +// A coordinator gather() stub: returns a one-line digest block. 
+const gather = async (g) => `ASK: ${g.ask.what}\nTYPE: ${g.evidenceType}\nFOUND: yes\nSUMMARY: stub`; + +test("resolved fixture: NEEDS_INFO → evidence → RESOLVED, rca captured, test_logs skipped", async () => { + const fx = load("resolved.json"); + const result = await runRcaLoop({ + testRunId: fx.testRunId, + firstMessage: "Error: empty buildName", + submit: replaySubmit(fx.turns), + config: CONFIG, + manifest: GITHUB_AVAILABLE, + gather, + }); + assert.equal(result.status, "RESOLVED"); + assert.match(result.root_cause, /#7421/); + assert.deepEqual(result.related_prs, ["#7421"]); + assert.deepEqual(result.asks_fulfilled, ["product_code"]); + assert.deepEqual(result.asks_skipped, ["test_logs"]); // TFA-owned, never gathered + assert.equal(result.turns_used, 2); + assert.equal(result.threadId, "thr-39"); +}); + +test("blocked fixture: terminal with reason captured", async () => { + const fx = load("blocked.json"); + const result = await runRcaLoop({ + testRunId: fx.testRunId, + submit: replaySubmit(fx.turns), + config: CONFIG, + }); + assert.equal(result.status, "BLOCKED"); + assert.match(result.root_cause, /could not obtain server-side logs/); +}); + +test("pending fixture: soft-PENDING ends with turnId, no re-poll", async () => { + const fx = load("pending.json"); + let calls = 0; + const counting = async (args) => { + calls++; + return replaySubmit(fx.turns)(args); + }; + const result = await runRcaLoop({ + testRunId: fx.testRunId, + submit: counting, + config: CONFIG, + }); + assert.equal(result.status, "PENDING"); + assert.equal(result.turnId, "turn-81-1"); + assert.equal(calls, 1); // ended immediately, did not poll again +}); + +test("turn-cap fixture: ends PENDING(turn-cap) at the cap, never a 7th submit", async () => { + const fx = load("turn-cap.json"); + let submits = 0; + const counting = async (args) => { + submits++; + return replaySubmit(fx.turns)(args); + }; + const result = await runRcaLoop({ + testRunId: fx.testRunId, + submit: counting, + config: 
CONFIG, + manifest: GITHUB_AVAILABLE, + gather, + }); + assert.equal(result.status, "PENDING"); + assert.equal(result.root_cause, "turn-cap"); + assert.equal(submits, 6); // capped at turnCap, never 7 +}); + +test("degraded path: no capability + auto resolveGap → asks_unavailable, still terminal", async () => { + // Same resolved fixture, but the client has NO github capability and runs auto + // (resolveGap returns null → 'unavailable'). The loop must still reach RESOLVED. + const fx = load("resolved.json"); + const result = await runRcaLoop({ + testRunId: fx.testRunId, + submit: replaySubmit(fx.turns), + config: CONFIG, + manifest: {}, // nothing available + resolveGap: async () => null, // auto: report unavailable + }); + assert.equal(result.status, "RESOLVED"); + assert.deepEqual(result.asks_unavailable, ["product_code"]); + assert.deepEqual(result.asks_fulfilled, []); +}); + +test("interactive resolveGap supplies the missing evidence → fulfilled, not unavailable", async () => { + const fx = load("resolved.json"); + const result = await runRcaLoop({ + testRunId: fx.testRunId, + submit: replaySubmit(fx.turns), + config: CONFIG, + manifest: {}, + resolveGap: async () => ({ digest: "ASK: ...\nFOUND: yes\nSUMMARY: user supplied" }), + }); + assert.equal(result.status, "RESOLVED"); + assert.deepEqual(result.asks_fulfilled, ["product_code"]); + assert.deepEqual(result.asks_unavailable, []); +}); + +test("no testRunId → failed block, tool never called", async () => { + let called = false; + const result = await runRcaLoop({ + testRunId: undefined, + submit: async () => { + called = true; + return {}; + }, + config: CONFIG, + }); + assert.equal(result.status, "failed"); + assert.equal(called, false); +}); diff --git a/tests/fixtures/recorded-turns/blocked.json b/tests/fixtures/recorded-turns/blocked.json new file mode 100644 index 0000000..35b5373 --- /dev/null +++ b/tests/fixtures/recorded-turns/blocked.json @@ -0,0 +1,13 @@ +{ + "name": "blocked — unmet asks", + 
"testRunId": 72, + "turns": [ + { + "status": "BLOCKED", + "confidence": "low", + "threadId": "thr-72", + "reason": "could not obtain server-side logs; cannot distinguish product bug from env flake", + "unmetAsks": ["kibana", "k8s"] + } + ] +} diff --git a/tests/fixtures/recorded-turns/pending.json b/tests/fixtures/recorded-turns/pending.json new file mode 100644 index 0000000..ec8b16a --- /dev/null +++ b/tests/fixtures/recorded-turns/pending.json @@ -0,0 +1,12 @@ +{ + "name": "soft-pending — resumable", + "testRunId": 81, + "turns": [ + { + "status": "PENDING", + "confidence": "unknown", + "threadId": "thr-81", + "turnId": "turn-81-1" + } + ] +} diff --git a/tests/fixtures/recorded-turns/resolved.json b/tests/fixtures/recorded-turns/resolved.json new file mode 100644 index 0000000..120dad6 --- /dev/null +++ b/tests/fixtures/recorded-turns/resolved.json @@ -0,0 +1,37 @@ +{ + "name": "needs_info → evidence → resolved", + "testRunId": 39, + "turns": [ + { + "status": "NEEDS_INFO", + "confidence": "low", + "threadId": "thr-39", + "questions": ["Did the buildName validator change?"], + "asks": [ + { + "what": "Did request-validation on POST /builds change since last green?", + "why": "the failing test posts an empty buildName", + "evidenceType": "product_code", + "priority": "high" + }, + { + "what": "Full run logs for test 39", + "why": "to read the failure", + "evidenceType": "test_logs", + "priority": "high" + } + ] + }, + { + "status": "RESOLVED", + "confidence": "high", + "threadId": "thr-39", + "rca": { + "root_cause": "PR #7421 tightened the buildName validator to reject empty strings", + "possible_fix": "send a non-empty buildName or relax the validator", + "failure_type": "product_regression", + "related_prs": ["#7421"] + } + } + ] +} diff --git a/tests/fixtures/recorded-turns/turn-cap.json b/tests/fixtures/recorded-turns/turn-cap.json new file mode 100644 index 0000000..4638b61 --- /dev/null +++ b/tests/fixtures/recorded-turns/turn-cap.json @@ -0,0 +1,12 @@ 
+{ + "name": "turn-cap — never resolves", + "testRunId": 99, + "turns": [ + { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] }, + { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] }, + { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] }, + { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] }, + { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] }, + { "status": "NEEDS_INFO", "confidence": "low", "threadId": "thr-99", "asks": [{ "what": "more", "why": "x", "evidenceType": "product_code", "priority": "high" }] } + ] +} From e9331afce51c173725c1b129a4d3e664f630e20f Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:20:27 +0530 Subject: [PATCH 11/12] fix(rca): make pending-resume resumable, enforce flip terminal status, skip turn-cap gather MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code-review fixes (suggested, non-blocking): - pending-resume removed from TERMINAL_STATES → soft-PENDING rows are now re-claimable, listed by pendingRows, and skipped by the reaper (they cleared in_flight), so the retained threadId/turnId actually drive an in-session resume instead of being stranded as a permanent non-terminal terminal. - flip() now rejects a missing/non-terminal rca_done without mutating, so a partial flip can't clear the claim yet leave the row pending (duplicate-RCA clobber). 
- loop checks the turn-cap BEFORE gathering, so evidence on the never-submitted final turn isn't gathered for nothing. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- lib/csv-state.mjs | 26 ++++++++++++++++++-------- lib/loop.mjs | 8 ++++++-- tests/csv-state.test.mjs | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/lib/csv-state.mjs b/lib/csv-state.mjs index 499f997..ea830ff 100644 --- a/lib/csv-state.mjs +++ b/lib/csv-state.mjs @@ -40,12 +40,13 @@ export const COLUMNS = [ ]; export const PENDING = "pending"; -const TERMINAL_STATES = new Set([ - "resolved", - "blocked", - "failed", - "pending-resume", -]); +export const RESUMABLE = "pending-resume"; +// Truly done — never re-claimed, listed, or reaped. +const TERMINAL_STATES = new Set(["resolved", "blocked", "failed"]); +// Valid outcomes flip() may write. `pending-resume` is a *soft* terminal: this +// attempt ended (claim cleared) but the row stays resumable — it keeps its +// threadId/turnId and is picked back up by the next fan-out / resume pass. +const FLIP_STATES = new Set(["resolved", "blocked", "failed", RESUMABLE]); // ---- minimal RFC4180-ish CSV codec ---------------------------------------- @@ -199,6 +200,11 @@ export function heartbeat(csvPath, testRunId, worker, nowMs) { // possible_fix, related_prs, threadId, turnId, coverage, confidence, // last_evidence_digest, cluster_id. export function flip(csvPath, testRunId, fields, nowMs) { + // Enforce the contract: a flip must name a valid outcome. A partial flip with + // a missing/non-terminal rca_done would otherwise clear the claim yet leave the + // row `pending` — re-exposing it for a duplicate RCA that clobbers this result. + // Reject without mutating so the worker keeps its claim and the bug surfaces. 
+ if (!FLIP_STATES.has(fields?.rca_done)) return false; const rows = readRows(csvPath); const row = rows.find((r) => String(r.testRunId) === String(testRunId)); if (!row) return false; @@ -233,7 +239,11 @@ export function reaper(csvPath, ttlSec, nowMs) { return reclaimed; } -// Rows still needing work (pending or reclaimed). The work-list for fan-out. +// Rows still needing work: fresh/reclaimed `pending` AND `pending-resume` rows +// (soft-PENDING attempts that retain a threadId/turnId to resume). The fan-out +// work-list. Truly terminal rows (resolved/blocked/failed) are excluded. export function pendingRows(csvPath) { - return readRows(csvPath).filter((r) => r.rca_done === PENDING); + return readRows(csvPath).filter( + (r) => r.rca_done === PENDING || r.rca_done === RESUMABLE, + ); } diff --git a/lib/loop.mjs b/lib/loop.mjs index 1d1728d..9e59eb2 100644 --- a/lib/loop.mjs +++ b/lib/loop.mjs @@ -90,7 +90,12 @@ export async function runRcaLoop({ return out("PENDING", turn, "soft-pending"); } - // NEEDS_INFO: route + fulfill. + // NEEDS_INFO. Check the turn-cap BEFORE gathering — evidence assembled on a + // turn we will never submit is wasted work (and a side-effecting gather() + // would run for nothing). + if (turns >= turnCap) return out("PENDING", turn, "turn-cap"); + + // Route + fulfill. const buckets = routeAsks(turn.asks ?? 
[], config, manifest); const blocks = []; for (const s of buckets.skip) skipped.add(s.evidenceType); @@ -109,7 +114,6 @@ export async function runRcaLoop({ } } - if (turns >= turnCap) return out("PENDING", turn, "turn-cap"); message = blocks.join("\n\n"); } } diff --git a/tests/csv-state.test.mjs b/tests/csv-state.test.mjs index 5a9a60f..ca55d84 100644 --- a/tests/csv-state.test.mjs +++ b/tests/csv-state.test.mjs @@ -120,6 +120,39 @@ test("pendingRows returns only pending work", () => { assert.equal(pend[0].testRunId, "102"); }); +test("flip rejects a missing/non-terminal rca_done without mutating the row", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); + // missing rca_done + assert.equal(flip(csv, 101, { root_cause: "x" }, 2000), false); + // invalid rca_done + assert.equal(flip(csv, 101, { rca_done: "weird" }, 2000), false); + const row = readRows(csv).find((r) => r.testRunId === "101"); + assert.equal(row.rca_done, PENDING); // not reverted to claimable-pending silently + assert.equal(row.in_flight_worker, "w1"); // claim intact — bug surfaces, no clobber + assert.equal(row.root_cause, ""); // nothing written +}); + +test("pending-resume is resumable: not terminal, listed, and re-claimable", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); + flip(csv, 101, { rca_done: "pending-resume", threadId: "thr-1", turnId: "t-1" }, 2000); + const row = readRows(csv).find((r) => r.testRunId === "101"); + assert.equal(row.in_flight_worker, ""); // this attempt released the claim + assert.equal(row.threadId, "thr-1"); // resume handles retained + assert.equal(row.turnId, "t-1"); + // appears in the fan-out work-list and can be claimed by the resume pass + assert.ok(pendingRows(csv).some((r) => r.testRunId === "101")); + assert.equal(claim(csv, 101, "w2", 3000), true); +}); + +test("reaper ignores pending-resume rows (not in flight)", () => { + seed(csv, "build-1", TESTS); + claim(csv, 101, "w1", 1000); + flip(csv, 101, { rca_done: 
"pending-resume" }, 2000); + assert.deepEqual(reaper(csv, 600, 10_000_000), []); +}); + test("CSV codec round-trips fields with commas, quotes, newlines", () => { seed(csv, "build-1", [{ test_id: 200, test_name: "weird" }]); flip( From 3d22dfe6d9e4cebbc00824749553a43b3c5e38b7 Mon Sep 17 00:00:00 2001 From: Ruturaj-Browserstack <ruturaj.s@browserstack.com> Date: Tue, 23 Jun 2026 22:27:42 +0530 Subject: [PATCH 12/12] chore(rca): gitignore local planning docs Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 9045f9d..4c14fb3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ node_modules/ .env # Per-run RCA batch state (the CSV/WAL spine + report) is workspace-local. .rca/ +# Planning docs (brainstorm/ideation/plan) stay local — not pushed. +docs/