From ac77154a6aceb1f29245fb6a092f93b537bd8afa Mon Sep 17 00:00:00 2001 From: Michael Dailey Date: Fri, 15 May 2026 09:30:47 -0500 Subject: [PATCH 1/2] PDX-482: feat(mcp): harden testcase tool descriptions for single-call contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RCA: PDX-479 surfaced that authoring guidance lives in three places (prompts, resource, tool descriptions) and a regression in any one of them — like the multi-call author-test flow that shipped in PR #153 — can drift the LLM away from correct test case construction. PDX-481 fixed the prompts + resource but the tool descriptions themselves still carried no construct-vs-amend contract. The steps[] field description was just "Ordered list of test steps" with no anti-pattern protection. If a future upstream prompt re-introduces the multi-call pattern, only the tool description can push back at the call site — and it currently says nothing about it. Fix: Added a three-line construction contract to the top of testCaseGenerate.ts TOOL_DESCRIPTION (single-call pattern, step_edit is for AMENDING, stop-and-assemble guidance for the common mistake). Tightened the steps[] field description to call out the FULL/COMPLETE step tree in one call and warn against the multi-call append pattern. Mirrored the contract in testCaseStepTools.ts: the step_edit description now self-identifies as AMENDMENT-ONLY, rejects construction usage, points agents at provar_testcase_generate for new test cases, and spells out the structural defects (dropped scenarios, flat asserts, inconsistent step types) from misuse. Added 6 regression-guard unit tests asserting the canonical phrasing in both tool descriptions. Added scripts/pdx-482-validate.cjs (13 protocol-surface assertions, 13/13 PASS) for direct JSON-RPC verification. Full gate green: 1127 mocha tests, lint clean, compile clean. --- scripts/pdx-482-validate.cjs | 177 ++++++++++++++++++++++++ src/mcp/tools/testCaseGenerate.ts | 18 ++- src/mcp/tools/testCaseStepTools.ts | 11 +- test/unit/mcp/testCaseGenerate.test.ts | 42 ++++++ test/unit/mcp/testCaseStepTools.test.ts | 38 +++++ 5 files changed, 284 insertions(+), 2 deletions(-) create mode 100644 scripts/pdx-482-validate.cjs diff --git a/scripts/pdx-482-validate.cjs b/scripts/pdx-482-validate.cjs new file mode 100644 index 00000000..d63d9668 --- /dev/null +++ b/scripts/pdx-482-validate.cjs @@ -0,0 +1,177 @@ +// PDX-482 validation: confirm the construct/amend contract is reachable at the +// MCP protocol surface. The LLM reads tools/list before every tool call, so +// every assertion here is on bytes the LLM literally sees at the call site. +// +// yarn compile +// node scripts/pdx-482-validate.cjs + +'use strict'; + +const { spawn } = require('child_process'); +const os = require('os'); +const path = require('path'); + +const TMP = os.tmpdir(); +const entry = path.resolve(__dirname, '..', 'bin', 'mcp-start.js'); + +const server = spawn(process.execPath, [entry, 'mcp', 'start', '--allowed-paths', TMP, '--no-update-check'], { + stdio: ['pipe', 'pipe', 'inherit'], +}); + +let nextId = 1; +const pending = new Map(); +let buf = ''; + +server.stdout.on('data', (chunk) => { + buf += chunk.toString('utf-8'); + let nl; + while ((nl = buf.indexOf('\n')) !== -1) { + const line = buf.slice(0, nl).trim(); + buf = buf.slice(nl + 1); + if (!line) continue; + try { + const msg = JSON.parse(line); + const cb = pending.get(msg.id); + if (cb) { + pending.delete(msg.id); + cb(msg); + } + } catch { + /* ignore */ + } + } +}); + +function rpc(method, params) { + const id = nextId++; + const req = JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'; + return new Promise((resolve, reject) => { + pending.set(id, resolve); + setTimeout(() => { + if (pending.has(id)) { + pending.delete(id); + reject(new Error(`Timeout waiting for ${method}`)); + } + }, 10000); + server.stdin.write(req); + }); +} + +const results = []; +function record(label, ok, detail) { + results.push({ label, ok, detail }); +} + +(async () => { + await rpc('initialize', { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'pdx-482-validate', version: '1.0.0' }, + }); + + const tools = await rpc('tools/list', {}); + const toolList = tools.result?.tools ?? []; + + // ── provar_testcase_generate tool description ───────────────────────────── + const gen = toolList.find((t) => t.name === 'provar_testcase_generate'); + if (!gen) { + record('provar_testcase_generate is registered', false, 'tool not found'); + } else { + const d = gen.description ?? ''; + record( + 'generate.description leads with "Construction pattern"', + /^[^.]*Construction pattern/.test(d), + d.slice(0, 80) + ); + record( + 'generate.description contains "single call"', + d.includes('single call'), + 'protects against PDX-479 regression at call site' + ); + record( + 'generate.description contains "FULL step tree"', + d.includes('FULL step tree'), + 'instructs full payload in one call' + ); + record( + 'generate.description contains "AMENDING"', + d.includes('AMENDING'), + 'marks step_edit as amendment-only at the generate call site' + ); + record( + 'generate.description rejects CONSTRUCTING via step_edit', + /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i.test(d), + 'explicit rejection of the PDX-479 pattern' + ); + record( + 'generate.description gives stop-and-assemble guidance', + d.includes('stop and assemble') || d.includes('stop, and assemble'), + 'tells agents what to do when they catch themselves in the multi-call pattern' + ); + + const stepsField = gen.inputSchema?.properties?.steps; + const fd = stepsField?.description ?? ''; + record( + 'generate.steps.description contains "COMPLETE step tree"', + fd.includes('COMPLETE step tree'), + 'field-level contract' + ); + record( + 'generate.steps.description contains "single call"', + fd.includes('single call'), + 'field-level single-call reminder' + ); + record( + 'generate.steps.description warns about amendments-only step_edit', + fd.includes('amendments only') || fd.includes('for amendments'), + 'field-level amend-only warning' + ); + } + + // ── provar_testcase_step_edit tool description ─────────────────────────── + const edit = toolList.find((t) => t.name === 'provar_testcase_step_edit'); + if (!edit) { + record('provar_testcase_step_edit is registered', false, 'tool not found'); + } else { + const d = edit.description ?? ''; + record( + 'step_edit.description self-identifies as AMENDMENT-ONLY', + d.includes('AMENDMENT-ONLY') || d.includes('AMENDING'), + 'lead-in framing the LLM reads first' + ); + record( + 'step_edit.description rejects construct-from-scratch usage', + d.includes('NOT for constructing') || d.includes('not for constructing'), + 'explicit rejection at call site' + ); + record( + 'step_edit.description points at provar_testcase_generate for new test cases', + d.includes('provar_testcase_generate'), + 'tells LLM where to go instead' + ); + record( + 'step_edit.description spells out the structural defects from misuse', + d.includes('dropped scenarios') || d.includes('flat asserts') || d.includes('inconsistent step types'), + 'consequence is explicit so the contract is judgement-friendly' + ); + } + + let pass = 0; + let fail = 0; + for (const r of results) { + console.log(`${r.ok ? '[PASS]' : '[FAIL]'} ${r.label} — ${r.detail}`); + if (r.ok) { + pass++; + } else { + fail++; + } + } + console.log(`\nPDX-482 validation: ${pass} passed, ${fail} failed`); + + server.stdin.end(); + process.exit(fail > 0 ? 1 : 0); +})().catch((err) => { + console.error('Validation script error:', err); + server.kill(); + process.exit(2); +}); diff --git a/src/mcp/tools/testCaseGenerate.ts b/src/mcp/tools/testCaseGenerate.ts index c4d2c3ae..0ed8c407 100644 --- a/src/mcp/tools/testCaseGenerate.ts +++ b/src/mcp/tools/testCaseGenerate.ts @@ -118,6 +118,15 @@ const StepSchema = z.object({ }); const TOOL_DESCRIPTION = [ + // ── Construction contract (READ FIRST — PDX-482) ────────────────────────────── + // The PDX-479 regression happened when authoring guidance steered agents toward + // a per-step construction pattern via repeated step_edit calls. These three + // lines make the single-call contract authoritative at the call site so it + // outweighs any conflicting prompt/resource guidance and survives doc drift. + 'Construction pattern: pass the FULL step tree in a single call via the steps[] array.', + 'Do NOT call this tool with an empty steps[] and then append via provar_testcase_step_edit — that pattern drops scenarios, flattens nesting, and produces inconsistent step types.', + 'provar_testcase_step_edit is for AMENDING an existing validated test case (single-step add, attribute fix, debug edit), not for CONSTRUCTING one from scratch. If you find yourself about to call this tool with steps=[] intending to add steps in subsequent tool calls, stop and assemble the full step list first.', + // ── Existing description (unchanged below) ─────────────────────────────────── 'Generate a Provar XML test case skeleton with proper UUID v4 guids, sequential testItemId values, and structure.', 'Returns XML content. Writes to disk only when dry_run=false.', 'Generated structure: with (id is always the integer literal "1" as required by the Provar runtime), a child, then .', @@ -168,7 +177,14 @@ export function registerTestCaseGenerate(server: McpServer, config: ServerConfig steps: z .array(StepSchema) .default([]) - .describe(desc('Ordered list of test steps', 'array, optional; ordered test steps')), + .describe( + desc( + 'Ordered list of test steps. Pass the COMPLETE step tree for the test case in a single call — ' + + 'do not call this tool with an empty array intending to append via provar_testcase_step_edit ' + + '(that pattern is for amendments only and produces structurally invalid test cases when used to construct).', + 'array, optional; FULL ordered step tree in one call' + ) + ), target_uri: z .string() .optional() diff --git a/src/mcp/tools/testCaseStepTools.ts b/src/mcp/tools/testCaseStepTools.ts index 704abe03..c4e5ecc1 100644 --- a/src/mcp/tools/testCaseStepTools.ts +++ b/src/mcp/tools/testCaseStepTools.ts @@ -89,6 +89,15 @@ export function registerTestCaseStepEdit(server: McpServer, config: ServerConfig title: 'Edit Test Case Step', description: desc( [ + // ── Usage contract (READ FIRST — PDX-482) ───────────────────────────── + // This tool AMENDS an existing validated test case. It is NOT for + // constructing a test case from scratch — building one step-by-step via + // repeated step_edit calls produces structurally invalid test cases + // (dropped scenarios, flat asserts, inconsistent step types — see PDX-479). + 'AMENDMENT-ONLY tool: this is for amending an existing, already-validated Provar test case (single-step add, attribute fix, debug edit).', + 'NOT for constructing a test case from scratch — for new test cases use provar_testcase_generate with the FULL steps[] tree in a single call.', + 'Building a test case step-by-step via repeated step_edit calls after a steps=[] generate produces structurally invalid output (dropped scenarios, flat asserts, inconsistent step types).', + // ── Mechanics (unchanged below) ─────────────────────────────────────── 'Add or remove a single step (apiCall) in a Provar XML test case file.', 'Uses write-to-temp-then-rename to minimise partial-write risk.', 'Prerequisites: the test case must exist and be valid XML.', @@ -102,7 +111,7 @@ export function registerTestCaseStepEdit(server: McpServer, config: ServerConfig 'Returns INVALID_XML_AFTER_EDIT (backup restored) when the mutated file fails validation.', 'Grounding for step_xml: call provar_qualityhub_examples_retrieve for corpus examples of the step type you need; if the response has count: 0 with a warning field, fall back: read the provar://docs/step-reference MCP resource.', ].join(' '), - 'Add or remove a single apiCall step in a Provar XML test case file.' + 'AMENDMENT-ONLY: add or remove a single apiCall step in an existing Provar test case (not for constructing new ones).' ), inputSchema: { test_case_path: z diff --git a/test/unit/mcp/testCaseGenerate.test.ts b/test/unit/mcp/testCaseGenerate.test.ts index 31290067..6ed45826 100644 --- a/test/unit/mcp/testCaseGenerate.test.ts +++ b/test/unit/mcp/testCaseGenerate.test.ts @@ -87,6 +87,48 @@ describe('provar_testcase_generate description', () => { 'description should include step-reference fallback' ); }); + + // ── PDX-482 regression guard: construction contract at the call site ────── + // The PDX-479 regression came from upstream guidance steering agents toward + // multi-call construction. These assertions protect the in-tool contract so + // even if upstream prompts/resources regress again, the LLM reads the + // single-call requirement at every call site. + + it('TOOL_DESCRIPTION carries the single-call construction contract', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('Construction pattern'), + 'description must lead with the construction-pattern contract for PDX-479 protection' + ); + assert.ok( + reg.description.includes('single call'), + 'description must say "single call" so the contract is greppable from the call site' + ); + assert.ok(reg.description.includes('FULL step tree'), 'description must instruct passing the FULL step tree'); + }); + + it('TOOL_DESCRIPTION marks step_edit as AMENDING, not constructing', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('AMENDING'), + 'description must explicitly say provar_testcase_step_edit is for AMENDING (caps for emphasis at the call site)' + ); + assert.ok( + /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i.test(reg.description), + 'description must explicitly reject CONSTRUCTING via step_edit' + ); + }); + + it('TOOL_DESCRIPTION gives stop-and-assemble guidance for the common mistake', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('stop and assemble') || reg.description.includes('stop, and assemble'), + 'description must tell agents to stop and assemble the full step list before calling — the most common mistake' + ); + }); }); // ── provar_testcase_generate ─────────────────────────────────────────────────── diff --git a/test/unit/mcp/testCaseStepTools.test.ts b/test/unit/mcp/testCaseStepTools.test.ts index 809cf3cd..d2222f82 100644 --- a/test/unit/mcp/testCaseStepTools.test.ts +++ b/test/unit/mcp/testCaseStepTools.test.ts @@ -96,6 +96,44 @@ describe('provar_testcase_step_edit description', () => { 'description should include step-reference fallback' ); }); + + // ── PDX-482 regression guard: amendment-only contract at the call site ──── + // The PDX-479 regression came from agents using step_edit to build test + // cases from scratch. This contract sits at the call site so the LLM reads + // it every time it considers calling step_edit, surviving any prompt drift. + + it('description self-identifies as AMENDMENT-ONLY at the top', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('AMENDMENT-ONLY') || reg.description.includes('AMENDING'), + 'description must lead with AMENDMENT-ONLY / AMENDING framing so the LLM reads it before mechanics' + ); + }); + + it('description explicitly rejects construction-from-scratch usage', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('NOT for constructing') || reg.description.includes('not for constructing'), + 'description must explicitly say it is NOT for constructing a test case from scratch' + ); + assert.ok( + reg.description.includes('provar_testcase_generate'), + 'description must point the agent at provar_testcase_generate for new test case construction' + ); + }); + + it('description warns about the structural defects from multi-call construction', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('dropped scenarios') || + reg.description.includes('flat asserts') || + reg.description.includes('inconsistent step types'), + 'description must spell out the structural defects (PDX-479) caused by multi-call construction so the LLM understands the consequence' + ); + }); }); // ── provar_testcase_step_edit ────────────────────────────────────────────────── From 47c75e218007670667d5814deff5c4fd22f43eaf Mon Sep 17 00:00:00 2001 From: Michael Dailey Date: Fri, 15 May 2026 09:43:39 -0500 Subject: [PATCH 2/2] PDX-482: fix(mcp): address adversarial review findings on PR #174 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RCA: An adversarial review of the original PDX-482 commit identified three critical defects in the construct/amend contract: (1) PROVAR_MCP_SCHEMA_MODE=compact silently swapped the description for a contract-free one-liner — the LLM would never see the contract in compact mode, making it a regression highway; (2) the regex /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i had a false-positive that could pass on hostile rewordings like "constructing...not via generate"; (3) no test asserted the contract appears EARLY in the description, so a future refactor could move it down where LLM attention is lower. Additionally the step_edit test used an OR-clause across the three structural defects so dropping two of them would silently dilute the warning. Fix: (1) Compact form on provar_testcase_generate now reads "Generate a Provar test case in ONE call with the FULL steps[] tree. Do NOT call with steps=[] then append via provar_testcase_step_edit (step_edit is for AMENDING existing test cases, not for CONSTRUCTING new ones)." — protocol-surface validator now spawns the server twice (standard + PROVAR_MCP_SCHEMA_MODE=compact) and runs 6 contract assertions against the compact form. (2) Replaced the false-positive regex with literal includes('not for CONSTRUCTING one from scratch') in both the unit test and the validator — locked on the canonical phrasing. (3) Added a leading-position assertion in both the unit test and validator: indexOf('Construction pattern') < 200 to prevent silent drift. (4) Tightened the step_edit "structural defects" test from OR-clause to three separate AND-style assertions on "dropped scenarios", "flat asserts", and "inconsistent step types" — dropping any one now fails the test. Gate: 1129 mocha tests, lint clean, validator 20/20 (was 13/13) covering both schema modes. --- scripts/pdx-482-validate.cjs | 209 +++++++++++++++++------- src/mcp/tools/testCaseGenerate.ts | 8 +- test/unit/mcp/testCaseGenerate.test.ts | 67 +++++++- test/unit/mcp/testCaseStepTools.test.ts | 19 ++- 4 files changed, 235 insertions(+), 68 deletions(-) diff --git a/scripts/pdx-482-validate.cjs b/scripts/pdx-482-validate.cjs index d63d9668..5b4e5473 100644 --- a/scripts/pdx-482-validate.cjs +++ b/scripts/pdx-482-validate.cjs @@ -1,6 +1,10 @@ // PDX-482 validation: confirm the construct/amend contract is reachable at the -// MCP protocol surface. The LLM reads tools/list before every tool call, so -// every assertion here is on bytes the LLM literally sees at the call site. +// MCP protocol surface in BOTH standard and compact schema modes. +// +// The LLM reads tools/list before every tool call, so every assertion here is +// on bytes the LLM literally sees at the call site. Compact mode coverage is +// critical because the adversarial review identified that PROVAR_MCP_SCHEMA_MODE=compact +// silently swapped the description for a contract-free one-liner. // // yarn compile // node scripts/pdx-482-validate.cjs @@ -14,65 +18,85 @@ const path = require('path'); const TMP = os.tmpdir(); const entry = path.resolve(__dirname, '..', 'bin', 'mcp-start.js'); -const server = spawn(process.execPath, [entry, 'mcp', 'start', '--allowed-paths', TMP, '--no-update-check'], { - stdio: ['pipe', 'pipe', 'inherit'], -}); +/** + * Spawn an MCP server in the given schema mode and run a set of assertions + * against tools/list. Returns the list of results. + * + * @param {string} mode - human-readable label, e.g. "standard" or "compact" + * @param {Record} extraEnv - env vars to merge into spawn env + * @param {(toolList: Array, record: (label: string, ok: boolean, detail: string) => void) => void} runAssertions + */ +function runValidation(mode, extraEnv, runAssertions) { + return new Promise((resolve, reject) => { + const server = spawn(process.execPath, [entry, 'mcp', 'start', '--allowed-paths', TMP, '--no-update-check'], { + stdio: ['pipe', 'pipe', 'inherit'], + env: { ...process.env, ...extraEnv }, + }); -let nextId = 1; -const pending = new Map(); -let buf = ''; - -server.stdout.on('data', (chunk) => { - buf += chunk.toString('utf-8'); - let nl; - while ((nl = buf.indexOf('\n')) !== -1) { - const line = buf.slice(0, nl).trim(); - buf = buf.slice(nl + 1); - if (!line) continue; - try { - const msg = JSON.parse(line); - const cb = pending.get(msg.id); - if (cb) { - pending.delete(msg.id); - cb(msg); - } - } catch { - /* ignore */ - } - } -}); + let nextId = 1; + const pending = new Map(); + let buf = ''; -function rpc(method, params) { - const id = nextId++; - const req = JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'; - return new Promise((resolve, reject) => { - pending.set(id, resolve); - setTimeout(() => { - if (pending.has(id)) { - pending.delete(id); - reject(new Error(`Timeout waiting for ${method}`)); + server.stdout.on('data', (chunk) => { + buf += chunk.toString('utf-8'); + let nl; + while ((nl = buf.indexOf('\n')) !== -1) { + const line = buf.slice(0, nl).trim(); + buf = buf.slice(nl + 1); + if (!line) continue; + try { + const msg = JSON.parse(line); + const cb = pending.get(msg.id); + if (cb) { + pending.delete(msg.id); + cb(msg); + } + } catch { + /* ignore */ + } } - }, 10000); - server.stdin.write(req); - }); -} + }); -const results = []; -function record(label, ok, detail) { - results.push({ label, ok, detail }); -} + const rpc = (method, params) => { + const id = nextId++; + const req = JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'; + return new Promise((rpcResolve, rpcReject) => { + pending.set(id, rpcResolve); + setTimeout(() => { + if (pending.has(id)) { + pending.delete(id); + rpcReject(new Error(`Timeout waiting for ${method}`)); + } + }, 10000); + server.stdin.write(req); + }); + }; -(async () => { - await rpc('initialize', { - protocolVersion: '2024-11-05', - capabilities: {}, - clientInfo: { name: 'pdx-482-validate', version: '1.0.0' }, - }); + const modeResults = []; + const record = (label, ok, detail) => { + modeResults.push({ label: `[${mode}] ${label}`, ok, detail }); + }; - const tools = await rpc('tools/list', {}); - const toolList = tools.result?.tools ?? []; + (async () => { + await rpc('initialize', { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'pdx-482-validate', version: '1.0.0' }, + }); + const tools = await rpc('tools/list', {}); + const toolList = tools.result?.tools ?? []; + runAssertions(toolList, record); + server.stdin.end(); + resolve(modeResults); + })().catch((err) => { + server.kill(); + reject(err); + }); + }); +} - // ── provar_testcase_generate tool description ───────────────────────────── +// ── Assertions for standard mode (full TOOL_DESCRIPTION) ──────────────────── +function standardAssertions(toolList, record) { const gen = toolList.find((t) => t.name === 'provar_testcase_generate'); if (!gen) { record('provar_testcase_generate is registered', false, 'tool not found'); @@ -100,8 +124,17 @@ function record(label, ok, detail) { ); record( 'generate.description rejects CONSTRUCTING via step_edit', - /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i.test(d), - 'explicit rejection of the PDX-479 pattern' + // PDX-482 hardening: literal substring (not regex) — the previous regex + // would false-positive on hostile rewordings like "constructing...not via generate". + d.includes('not for CONSTRUCTING one from scratch'), + 'literal canonical phrase: "not for CONSTRUCTING one from scratch"' + ); + record( + 'generate.description: contract appears in the first 200 chars', + d.indexOf('Construction pattern') >= 0 && d.indexOf('Construction pattern') < 200, + `position: ${d.indexOf( + 'Construction pattern' + )} (LLMs weight leading tokens more; truncating clients cut at ~1024)` ); record( 'generate.description gives stop-and-assemble guidance', @@ -128,7 +161,6 @@ function record(label, ok, detail) { ); } - // ── provar_testcase_step_edit tool description ─────────────────────────── const edit = toolList.find((t) => t.name === 'provar_testcase_step_edit'); if (!edit) { record('provar_testcase_step_edit is registered', false, 'tool not found'); @@ -155,10 +187,68 @@ function record(label, ok, detail) { 'consequence is explicit so the contract is judgement-friendly' ); } +} + +// ── Assertions for compact mode (short one-liner) ─────────────────────────── +// Adversarial review (Critical #1): the compact form must STILL carry the +// contract or PROVAR_MCP_SCHEMA_MODE=compact becomes a regression highway. +function compactAssertions(toolList, record) { + const gen = toolList.find((t) => t.name === 'provar_testcase_generate'); + if (!gen) { + record('provar_testcase_generate is registered', false, 'tool not found'); + } else { + const d = gen.description ?? ''; + record( + 'compact generate.description carries single-call contract', + d.includes('ONE call'), + 'must mention "ONE call" so contract is visible even when the standard form is stripped' + ); + record( + 'compact generate.description carries FULL steps[] tree contract', + d.includes('FULL steps'), + 'must mention FULL steps[] in the compact form' + ); + record( + 'compact generate.description carries AMENDING vs CONSTRUCTING framing', + d.includes('AMENDING') && d.includes('CONSTRUCTING'), + 'must split AMENDING (step_edit) vs CONSTRUCTING (generate) in the compact form' + ); + record( + 'compact generate.description does NOT regress to the pre-PDX-482 contract-free form', + !/^Generate a Provar XML test case skeleton with UUID guids and steps structure\.?$/.test(d), + 'old compact form must be replaced' + ); + } + + const edit = toolList.find((t) => t.name === 'provar_testcase_step_edit'); + if (!edit) { + record('provar_testcase_step_edit is registered', false, 'tool not found'); + } else { + const d = edit.description ?? ''; + record( + 'compact step_edit.description self-identifies as AMENDMENT-ONLY', + d.includes('AMENDMENT-ONLY') || d.includes('amendment') || d.includes('AMENDING'), + 'amendment framing must survive compact mode' + ); + record( + 'compact step_edit.description rejects construct-from-scratch usage', + d.includes('not for constructing') || d.includes('NOT for constructing') || d.includes('not for CONSTRUCTING'), + 'rejection must survive compact mode' + ); + } +} + +(async () => { + const standardResults = await runValidation('standard', {}, standardAssertions); + // Explicitly null out the env var on the standard pass to ensure no leakage. + // For compact, set PROVAR_MCP_SCHEMA_MODE=compact via the spawn env. + const compactResults = await runValidation('compact', { PROVAR_MCP_SCHEMA_MODE: 'compact' }, compactAssertions); + + const allResults = [...standardResults, ...compactResults]; let pass = 0; let fail = 0; - for (const r of results) { + for (const r of allResults) { console.log(`${r.ok ? '[PASS]' : '[FAIL]'} ${r.label} — ${r.detail}`); if (r.ok) { pass++; @@ -167,11 +257,8 @@ function record(label, ok, detail) { } } console.log(`\nPDX-482 validation: ${pass} passed, ${fail} failed`); - - server.stdin.end(); process.exit(fail > 0 ? 1 : 0); })().catch((err) => { console.error('Validation script error:', err); - server.kill(); process.exit(2); }); diff --git a/src/mcp/tools/testCaseGenerate.ts b/src/mcp/tools/testCaseGenerate.ts index 0ed8c407..4d7d64dc 100644 --- a/src/mcp/tools/testCaseGenerate.ts +++ b/src/mcp/tools/testCaseGenerate.ts @@ -170,7 +170,13 @@ export function registerTestCaseGenerate(server: McpServer, config: ServerConfig title: 'Generate Test Case', description: desc( TOOL_DESCRIPTION, - 'Generate a Provar XML test case skeleton with UUID guids and steps structure.' + // PDX-482: the compact form must also carry the construction contract, + // otherwise PROVAR_MCP_SCHEMA_MODE=compact is a regression highway — + // the LLM would see a contract-free one-liner and could fall back to + // the multi-call pattern that caused PDX-479. + 'Generate a Provar test case in ONE call with the FULL steps[] tree. ' + + 'Do NOT call with steps=[] then append via provar_testcase_step_edit ' + + '(step_edit is for AMENDING existing test cases, not for CONSTRUCTING new ones).' ), inputSchema: { test_case_name: z.string().describe(desc('Test case name (human-readable label)', 'string, test case name')), diff --git a/test/unit/mcp/testCaseGenerate.test.ts b/test/unit/mcp/testCaseGenerate.test.ts index 6ed45826..b2049463 100644 --- a/test/unit/mcp/testCaseGenerate.test.ts +++ b/test/unit/mcp/testCaseGenerate.test.ts @@ -115,9 +115,14 @@ describe('provar_testcase_generate description', () => { reg.description.includes('AMENDING'), 'description must explicitly say provar_testcase_step_edit is for AMENDING (caps for emphasis at the call site)' ); + // Use a literal substring match (not a regex) — the previous regex + // /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i had a + // false-positive: the second alternative would pass on hostile text like + // "constructing is the only way... not via generate". Locking on the + // exact canonical phrasing prevents that drift. assert.ok( - /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i.test(reg.description), - 'description must explicitly reject CONSTRUCTING via step_edit' + reg.description.includes('not for CONSTRUCTING one from scratch'), + 'description must explicitly say step_edit is "not for CONSTRUCTING one from scratch" (literal canonical phrase)' ); }); @@ -129,6 +134,64 @@ describe('provar_testcase_generate description', () => { 'description must tell agents to stop and assemble the full step list before calling — the most common mistake' ); }); + + // ── PDX-482 hardening: leading-position assertion (adversarial review fix) ── + // The contract must appear EARLY in the description because LLMs weight + // earlier tokens more heavily and many MCP clients truncate descriptions. + // Without this guard, a future refactor could move the contract to the end + // of the joined array and every other assertion would still pass. + it('Construction contract appears in the first 200 characters of the description', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + const pos = reg.description.indexOf('Construction pattern'); + assert.ok(pos >= 0, 'description must contain "Construction pattern"'); + assert.ok( + pos < 200, + `"Construction pattern" must appear in the first 200 chars (found at ${pos}) — LLMs weight leading tokens more` + ); + }); + + // ── PDX-482 hardening: compact-mode coverage (adversarial review fix) ────── + // PROVAR_MCP_SCHEMA_MODE=compact swaps the entire description for a short + // one-liner. Without this guard, compact mode is a regression highway: + // the LLM would see a contract-free description and could fall back to the + // multi-call pattern that caused PDX-479. + describe('compact-mode (PROVAR_MCP_SCHEMA_MODE=compact)', () => { + const ORIGINAL_MODE = process.env['PROVAR_MCP_SCHEMA_MODE']; + let compactServer: MockMcpServer; + + beforeEach(() => { + process.env['PROVAR_MCP_SCHEMA_MODE'] = 'compact'; + compactServer = new MockMcpServer(); + registerTestCaseGenerate(compactServer as never, { allowedPaths: [tmpDir] }); + }); + + afterEach(() => { + if (ORIGINAL_MODE === undefined) { + delete process.env['PROVAR_MCP_SCHEMA_MODE']; + } else { + process.env['PROVAR_MCP_SCHEMA_MODE'] = ORIGINAL_MODE; + } + }); + + it('compact description still carries the single-call construction contract', () => { + const reg = compactServer.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered in compact mode'); + assert.ok( + reg.description.includes('ONE call'), + 'compact description must say "ONE call" — otherwise compact mode silently strips the contract (PDX-479 regression highway)' + ); + assert.ok(reg.description.includes('FULL steps'), 'compact description must mention the FULL steps[] tree'); + assert.ok( + reg.description.includes('AMENDING') || reg.description.includes('amend'), + 'compact description must mark step_edit as amendment-only' + ); + assert.ok( + !reg.description.includes('UUID guids and steps structure'), + 'old compact form (contract-free) must not be in use anymore' + ); + }); + }); }); // ── provar_testcase_generate ─────────────────────────────────────────────────── diff --git a/test/unit/mcp/testCaseStepTools.test.ts b/test/unit/mcp/testCaseStepTools.test.ts index d2222f82..17ca038b 100644 --- a/test/unit/mcp/testCaseStepTools.test.ts +++ b/test/unit/mcp/testCaseStepTools.test.ts @@ -127,11 +127,22 @@ describe('provar_testcase_step_edit description', () => { it('description warns about the structural defects from multi-call construction', () => { const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); assert.ok(reg, 'tool should be registered'); + // Adversarial review (PDX-482 hardening): require ALL three defects, not + // just one. An OR-clause would allow silent dilution where a future cleanup + // removes two of the three defects but leaves one and the test still passes. + // Listing the full consequence chain is what gives the LLM the "why" needed + // to apply judgement when guidance is ambiguous. assert.ok( - reg.description.includes('dropped scenarios') || - reg.description.includes('flat asserts') || - reg.description.includes('inconsistent step types'), - 'description must spell out the structural defects (PDX-479) caused by multi-call construction so the LLM understands the consequence' + reg.description.includes('dropped scenarios'), + 'description must call out "dropped scenarios" (the symptom that first surfaced PDX-479)' + ); + assert.ok( + reg.description.includes('flat asserts'), + 'description must call out "flat asserts" (the second observable defect)' + ); + assert.ok( + reg.description.includes('inconsistent step types'), + 'description must call out "inconsistent step types" (the third observable defect)' ); }); });