diff --git a/scripts/pdx-482-validate.cjs b/scripts/pdx-482-validate.cjs new file mode 100644 index 00000000..5b4e5473 --- /dev/null +++ b/scripts/pdx-482-validate.cjs @@ -0,0 +1,264 @@ +// PDX-482 validation: confirm the construct/amend contract is reachable at the +// MCP protocol surface in BOTH standard and compact schema modes. +// +// The LLM reads tools/list before every tool call, so every assertion here is +// on bytes the LLM literally sees at the call site. Compact mode coverage is +// critical because the adversarial review identified that PROVAR_MCP_SCHEMA_MODE=compact +// silently swapped the description for a contract-free one-liner. +// +// yarn compile +// node scripts/pdx-482-validate.cjs + +'use strict'; + +const { spawn } = require('child_process'); +const os = require('os'); +const path = require('path'); + +const TMP = os.tmpdir(); +const entry = path.resolve(__dirname, '..', 'bin', 'mcp-start.js'); + +/** + * Spawn an MCP server in the given schema mode and run a set of assertions + * against tools/list. Returns the list of results. + * + * @param {string} mode - human-readable label, e.g. "standard" or "compact" + * @param {Record} extraEnv - env vars to merge into spawn env + * @param {(toolList: Array, record: (label: string, ok: boolean, detail: string) => void) => void} runAssertions + */ +function runValidation(mode, extraEnv, runAssertions) { + return new Promise((resolve, reject) => { + const server = spawn(process.execPath, [entry, 'mcp', 'start', '--allowed-paths', TMP, '--no-update-check'], { + stdio: ['pipe', 'pipe', 'inherit'], + env: { ...process.env, ...extraEnv }, + }); + + let nextId = 1; + const pending = new Map(); + let buf = ''; + + server.stdout.on('data', (chunk) => { + buf += chunk.toString('utf-8'); + let nl; + while ((nl = buf.indexOf('\n')) !== -1) { + const line = buf.slice(0, nl).trim(); + buf = buf.slice(nl + 1); + if (!line) continue; + try { + const msg = JSON.parse(line); + const cb = pending.get(msg.id); + if (cb) { + pending.delete(msg.id); + cb(msg); + } + } catch { + /* ignore */ + } + } + }); + + const rpc = (method, params) => { + const id = nextId++; + const req = JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'; + return new Promise((rpcResolve, rpcReject) => { + pending.set(id, rpcResolve); + setTimeout(() => { + if (pending.has(id)) { + pending.delete(id); + rpcReject(new Error(`Timeout waiting for ${method}`)); + } + }, 10000); + server.stdin.write(req); + }); + }; + + const modeResults = []; + const record = (label, ok, detail) => { + modeResults.push({ label: `[${mode}] ${label}`, ok, detail }); + }; + + (async () => { + await rpc('initialize', { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'pdx-482-validate', version: '1.0.0' }, + }); + const tools = await rpc('tools/list', {}); + const toolList = tools.result?.tools ?? []; + runAssertions(toolList, record); + server.stdin.end(); + resolve(modeResults); + })().catch((err) => { + server.kill(); + reject(err); + }); + }); +} + +// ── Assertions for standard mode (full TOOL_DESCRIPTION) ──────────────────── +function standardAssertions(toolList, record) { + const gen = toolList.find((t) => t.name === 'provar_testcase_generate'); + if (!gen) { + record('provar_testcase_generate is registered', false, 'tool not found'); + } else { + const d = gen.description ?? ''; + record( + 'generate.description leads with "Construction pattern"', + /^[^.]*Construction pattern/.test(d), + d.slice(0, 80) + ); + record( + 'generate.description contains "single call"', + d.includes('single call'), + 'protects against PDX-479 regression at call site' + ); + record( + 'generate.description contains "FULL step tree"', + d.includes('FULL step tree'), + 'instructs full payload in one call' + ); + record( + 'generate.description contains "AMENDING"', + d.includes('AMENDING'), + 'marks step_edit as amendment-only at the generate call site' + ); + record( + 'generate.description rejects CONSTRUCTING via step_edit', + // PDX-482 hardening: literal substring (not regex) — the previous regex + // would false-positive on hostile rewordings like "constructing...not via generate". + d.includes('not for CONSTRUCTING one from scratch'), + 'literal canonical phrase: "not for CONSTRUCTING one from scratch"' + ); + record( + 'generate.description: contract appears in the first 200 chars', + d.indexOf('Construction pattern') >= 0 && d.indexOf('Construction pattern') < 200, + `position: ${d.indexOf( + 'Construction pattern' + )} (LLMs weight leading tokens more; truncating clients cut at ~1024)` + ); + record( + 'generate.description gives stop-and-assemble guidance', + d.includes('stop and assemble') || d.includes('stop, and assemble'), + 'tells agents what to do when they catch themselves in the multi-call pattern' + ); + + const stepsField = gen.inputSchema?.properties?.steps; + const fd = stepsField?.description ?? ''; + record( + 'generate.steps.description contains "COMPLETE step tree"', + fd.includes('COMPLETE step tree'), + 'field-level contract' + ); + record( + 'generate.steps.description contains "single call"', + fd.includes('single call'), + 'field-level single-call reminder' + ); + record( + 'generate.steps.description warns about amendments-only step_edit', + fd.includes('amendments only') || fd.includes('for amendments'), + 'field-level amend-only warning' + ); + } + + const edit = toolList.find((t) => t.name === 'provar_testcase_step_edit'); + if (!edit) { + record('provar_testcase_step_edit is registered', false, 'tool not found'); + } else { + const d = edit.description ?? ''; + record( + 'step_edit.description self-identifies as AMENDMENT-ONLY', + d.includes('AMENDMENT-ONLY') || d.includes('AMENDING'), + 'lead-in framing the LLM reads first' + ); + record( + 'step_edit.description rejects construct-from-scratch usage', + d.includes('NOT for constructing') || d.includes('not for constructing'), + 'explicit rejection at call site' + ); + record( + 'step_edit.description points at provar_testcase_generate for new test cases', + d.includes('provar_testcase_generate'), + 'tells LLM where to go instead' + ); + record( + 'step_edit.description spells out the structural defects from misuse', + d.includes('dropped scenarios') || d.includes('flat asserts') || d.includes('inconsistent step types'), + 'consequence is explicit so the contract is judgement-friendly' + ); + } +} + +// ── Assertions for compact mode (short one-liner) ─────────────────────────── +// Adversarial review (Critical #1): the compact form must STILL carry the +// contract or PROVAR_MCP_SCHEMA_MODE=compact becomes a regression highway. +function compactAssertions(toolList, record) { + const gen = toolList.find((t) => t.name === 'provar_testcase_generate'); + if (!gen) { + record('provar_testcase_generate is registered', false, 'tool not found'); + } else { + const d = gen.description ?? ''; + record( + 'compact generate.description carries single-call contract', + d.includes('ONE call'), + 'must mention "ONE call" so contract is visible even when the standard form is stripped' + ); + record( + 'compact generate.description carries FULL steps[] tree contract', + d.includes('FULL steps'), + 'must mention FULL steps[] in the compact form' + ); + record( + 'compact generate.description carries AMENDING vs CONSTRUCTING framing', + d.includes('AMENDING') && d.includes('CONSTRUCTING'), + 'must split AMENDING (step_edit) vs CONSTRUCTING (generate) in the compact form' + ); + record( + 'compact generate.description does NOT regress to the pre-PDX-482 contract-free form', + !/^Generate a Provar XML test case skeleton with UUID guids and steps structure\.?$/.test(d), + 'old compact form must be replaced' + ); + } + + const edit = toolList.find((t) => t.name === 'provar_testcase_step_edit'); + if (!edit) { + record('provar_testcase_step_edit is registered', false, 'tool not found'); + } else { + const d = edit.description ?? ''; + record( + 'compact step_edit.description self-identifies as AMENDMENT-ONLY', + d.includes('AMENDMENT-ONLY') || d.includes('amendment') || d.includes('AMENDING'), + 'amendment framing must survive compact mode' + ); + record( + 'compact step_edit.description rejects construct-from-scratch usage', + d.includes('not for constructing') || d.includes('NOT for constructing') || d.includes('not for CONSTRUCTING'), + 'rejection must survive compact mode' + ); + } +} + +(async () => { + const standardResults = await runValidation('standard', {}, standardAssertions); + // Explicitly null out the env var on the standard pass to ensure no leakage. + // For compact, set PROVAR_MCP_SCHEMA_MODE=compact via the spawn env. + const compactResults = await runValidation('compact', { PROVAR_MCP_SCHEMA_MODE: 'compact' }, compactAssertions); + + const allResults = [...standardResults, ...compactResults]; + + let pass = 0; + let fail = 0; + for (const r of allResults) { + console.log(`${r.ok ? '[PASS]' : '[FAIL]'} ${r.label} — ${r.detail}`); + if (r.ok) { + pass++; + } else { + fail++; + } + } + console.log(`\nPDX-482 validation: ${pass} passed, ${fail} failed`); + process.exit(fail > 0 ? 1 : 0); +})().catch((err) => { + console.error('Validation script error:', err); + process.exit(2); +}); diff --git a/src/mcp/tools/testCaseGenerate.ts b/src/mcp/tools/testCaseGenerate.ts index c4d2c3ae..4d7d64dc 100644 --- a/src/mcp/tools/testCaseGenerate.ts +++ b/src/mcp/tools/testCaseGenerate.ts @@ -118,6 +118,15 @@ const StepSchema = z.object({ }); const TOOL_DESCRIPTION = [ + // ── Construction contract (READ FIRST — PDX-482) ────────────────────────────── + // The PDX-479 regression happened when authoring guidance steered agents toward + // a per-step construction pattern via repeated step_edit calls. These three + // lines make the single-call contract authoritative at the call site so it + // outweighs any conflicting prompt/resource guidance and survives doc drift. + 'Construction pattern: pass the FULL step tree in a single call via the steps[] array.', + 'Do NOT call this tool with an empty steps[] and then append via provar_testcase_step_edit — that pattern drops scenarios, flattens nesting, and produces inconsistent step types.', + 'provar_testcase_step_edit is for AMENDING an existing validated test case (single-step add, attribute fix, debug edit), not for CONSTRUCTING one from scratch. If you find yourself about to call this tool with steps=[] intending to add steps in subsequent tool calls, stop and assemble the full step list first.', + // ── Existing description (unchanged below) ─────────────────────────────────── 'Generate a Provar XML test case skeleton with proper UUID v4 guids, sequential testItemId values, and structure.', 'Returns XML content. Writes to disk only when dry_run=false.', 'Generated structure: with (id is always the integer literal "1" as required by the Provar runtime), a child, then .', @@ -161,14 +170,27 @@ export function registerTestCaseGenerate(server: McpServer, config: ServerConfig title: 'Generate Test Case', description: desc( TOOL_DESCRIPTION, - 'Generate a Provar XML test case skeleton with UUID guids and steps structure.' + // PDX-482: the compact form must also carry the construction contract, + // otherwise PROVAR_MCP_SCHEMA_MODE=compact is a regression highway — + // the LLM would see a contract-free one-liner and could fall back to + // the multi-call pattern that caused PDX-479. + 'Generate a Provar test case in ONE call with the FULL steps[] tree. ' + + 'Do NOT call with steps=[] then append via provar_testcase_step_edit ' + + '(step_edit is for AMENDING existing test cases, not for CONSTRUCTING new ones).' ), inputSchema: { test_case_name: z.string().describe(desc('Test case name (human-readable label)', 'string, test case name')), steps: z .array(StepSchema) .default([]) - .describe(desc('Ordered list of test steps', 'array, optional; ordered test steps')), + .describe( + desc( + 'Ordered list of test steps. Pass the COMPLETE step tree for the test case in a single call — ' + + 'do not call this tool with an empty array intending to append via provar_testcase_step_edit ' + + '(that pattern is for amendments only and produces structurally invalid test cases when used to construct).', + 'array, optional; FULL ordered step tree in one call' + ) + ), target_uri: z .string() .optional() diff --git a/src/mcp/tools/testCaseStepTools.ts b/src/mcp/tools/testCaseStepTools.ts index 704abe03..c4e5ecc1 100644 --- a/src/mcp/tools/testCaseStepTools.ts +++ b/src/mcp/tools/testCaseStepTools.ts @@ -89,6 +89,15 @@ export function registerTestCaseStepEdit(server: McpServer, config: ServerConfig title: 'Edit Test Case Step', description: desc( [ + // ── Usage contract (READ FIRST — PDX-482) ───────────────────────────── + // This tool AMENDS an existing validated test case. It is NOT for + // constructing a test case from scratch — building one step-by-step via + // repeated step_edit calls produces structurally invalid test cases + // (dropped scenarios, flat asserts, inconsistent step types — see PDX-479). + 'AMENDMENT-ONLY tool: this is for amending an existing, already-validated Provar test case (single-step add, attribute fix, debug edit).', + 'NOT for constructing a test case from scratch — for new test cases use provar_testcase_generate with the FULL steps[] tree in a single call.', + 'Building a test case step-by-step via repeated step_edit calls after a steps=[] generate produces structurally invalid output (dropped scenarios, flat asserts, inconsistent step types).', + // ── Mechanics (unchanged below) ─────────────────────────────────────── 'Add or remove a single step (apiCall) in a Provar XML test case file.', 'Uses write-to-temp-then-rename to minimise partial-write risk.', 'Prerequisites: the test case must exist and be valid XML.', @@ -102,7 +111,7 @@ export function registerTestCaseStepEdit(server: McpServer, config: ServerConfig 'Returns INVALID_XML_AFTER_EDIT (backup restored) when the mutated file fails validation.', 'Grounding for step_xml: call provar_qualityhub_examples_retrieve for corpus examples of the step type you need; if the response has count: 0 with a warning field, fall back: read the provar://docs/step-reference MCP resource.', ].join(' '), - 'Add or remove a single apiCall step in a Provar XML test case file.' + 'AMENDMENT-ONLY: add or remove a single apiCall step in an existing Provar test case (not for constructing new ones).' ), inputSchema: { test_case_path: z diff --git a/test/unit/mcp/testCaseGenerate.test.ts b/test/unit/mcp/testCaseGenerate.test.ts index 31290067..b2049463 100644 --- a/test/unit/mcp/testCaseGenerate.test.ts +++ b/test/unit/mcp/testCaseGenerate.test.ts @@ -87,6 +87,111 @@ describe('provar_testcase_generate description', () => { 'description should include step-reference fallback' ); }); + + // ── PDX-482 regression guard: construction contract at the call site ────── + // The PDX-479 regression came from upstream guidance steering agents toward + // multi-call construction. These assertions protect the in-tool contract so + // even if upstream prompts/resources regress again, the LLM reads the + // single-call requirement at every call site. + + it('TOOL_DESCRIPTION carries the single-call construction contract', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('Construction pattern'), + 'description must lead with the construction-pattern contract for PDX-479 protection' + ); + assert.ok( + reg.description.includes('single call'), + 'description must say "single call" so the contract is greppable from the call site' + ); + assert.ok(reg.description.includes('FULL step tree'), 'description must instruct passing the FULL step tree'); + }); + + it('TOOL_DESCRIPTION marks step_edit as AMENDING, not constructing', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('AMENDING'), + 'description must explicitly say provar_testcase_step_edit is for AMENDING (caps for emphasis at the call site)' + ); + // Use a literal substring match (not a regex) — the previous regex + // /step_edit[^.]*not for CONSTRUCTING|CONSTRUCTING[^.]*not/i had a + // false-positive: the second alternative would pass on hostile text like + // "constructing is the only way... not via generate". Locking on the + // exact canonical phrasing prevents that drift. + assert.ok( + reg.description.includes('not for CONSTRUCTING one from scratch'), + 'description must explicitly say step_edit is "not for CONSTRUCTING one from scratch" (literal canonical phrase)' + ); + }); + + it('TOOL_DESCRIPTION gives stop-and-assemble guidance for the common mistake', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('stop and assemble') || reg.description.includes('stop, and assemble'), + 'description must tell agents to stop and assemble the full step list before calling — the most common mistake' + ); + }); + + // ── PDX-482 hardening: leading-position assertion (adversarial review fix) ── + // The contract must appear EARLY in the description because LLMs weight + // earlier tokens more heavily and many MCP clients truncate descriptions. + // Without this guard, a future refactor could move the contract to the end + // of the joined array and every other assertion would still pass. + it('Construction contract appears in the first 200 characters of the description', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered'); + const pos = reg.description.indexOf('Construction pattern'); + assert.ok(pos >= 0, 'description must contain "Construction pattern"'); + assert.ok( + pos < 200, + `"Construction pattern" must appear in the first 200 chars (found at ${pos}) — LLMs weight leading tokens more` + ); + }); + + // ── PDX-482 hardening: compact-mode coverage (adversarial review fix) ────── + // PROVAR_MCP_SCHEMA_MODE=compact swaps the entire description for a short + // one-liner. Without this guard, compact mode is a regression highway: + // the LLM would see a contract-free description and could fall back to the + // multi-call pattern that caused PDX-479. + describe('compact-mode (PROVAR_MCP_SCHEMA_MODE=compact)', () => { + const ORIGINAL_MODE = process.env['PROVAR_MCP_SCHEMA_MODE']; + let compactServer: MockMcpServer; + + beforeEach(() => { + process.env['PROVAR_MCP_SCHEMA_MODE'] = 'compact'; + compactServer = new MockMcpServer(); + registerTestCaseGenerate(compactServer as never, { allowedPaths: [tmpDir] }); + }); + + afterEach(() => { + if (ORIGINAL_MODE === undefined) { + delete process.env['PROVAR_MCP_SCHEMA_MODE']; + } else { + process.env['PROVAR_MCP_SCHEMA_MODE'] = ORIGINAL_MODE; + } + }); + + it('compact description still carries the single-call construction contract', () => { + const reg = compactServer.registrations.find((r) => r.name === 'provar_testcase_generate'); + assert.ok(reg, 'tool should be registered in compact mode'); + assert.ok( + reg.description.includes('ONE call'), + 'compact description must say "ONE call" — otherwise compact mode silently strips the contract (PDX-479 regression highway)' + ); + assert.ok(reg.description.includes('FULL steps'), 'compact description must mention the FULL steps[] tree'); + assert.ok( + reg.description.includes('AMENDING') || reg.description.includes('amend'), + 'compact description must mark step_edit as amendment-only' + ); + assert.ok( + !reg.description.includes('UUID guids and steps structure'), + 'old compact form (contract-free) must not be in use anymore' + ); + }); + }); }); // ── provar_testcase_generate ─────────────────────────────────────────────────── diff --git a/test/unit/mcp/testCaseStepTools.test.ts b/test/unit/mcp/testCaseStepTools.test.ts index 809cf3cd..17ca038b 100644 --- a/test/unit/mcp/testCaseStepTools.test.ts +++ b/test/unit/mcp/testCaseStepTools.test.ts @@ -96,6 +96,55 @@ describe('provar_testcase_step_edit description', () => { 'description should include step-reference fallback' ); }); + + // ── PDX-482 regression guard: amendment-only contract at the call site ──── + // The PDX-479 regression came from agents using step_edit to build test + // cases from scratch. This contract sits at the call site so the LLM reads + // it every time it considers calling step_edit, surviving any prompt drift. + + it('description self-identifies as AMENDMENT-ONLY at the top', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('AMENDMENT-ONLY') || reg.description.includes('AMENDING'), + 'description must lead with AMENDMENT-ONLY / AMENDING framing so the LLM reads it before mechanics' + ); + }); + + it('description explicitly rejects construction-from-scratch usage', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); + assert.ok(reg, 'tool should be registered'); + assert.ok( + reg.description.includes('NOT for constructing') || reg.description.includes('not for constructing'), + 'description must explicitly say it is NOT for constructing a test case from scratch' + ); + assert.ok( + reg.description.includes('provar_testcase_generate'), + 'description must point the agent at provar_testcase_generate for new test case construction' + ); + }); + + it('description warns about the structural defects from multi-call construction', () => { + const reg = server.registrations.find((r) => r.name === 'provar_testcase_step_edit'); + assert.ok(reg, 'tool should be registered'); + // Adversarial review (PDX-482 hardening): require ALL three defects, not + // just one. An OR-clause would allow silent dilution where a future cleanup + // removes two of the three defects but leaves one and the test still passes. + // Listing the full consequence chain is what gives the LLM the "why" needed + // to apply judgement when guidance is ambiguous. + assert.ok( + reg.description.includes('dropped scenarios'), + 'description must call out "dropped scenarios" (the symptom that first surfaced PDX-479)' + ); + assert.ok( + reg.description.includes('flat asserts'), + 'description must call out "flat asserts" (the second observable defect)' + ); + assert.ok( + reg.description.includes('inconsistent step types'), + 'description must call out "inconsistent step types" (the third observable defect)' + ); + }); }); // ── provar_testcase_step_edit ──────────────────────────────────────────────────