diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts index 2fcc4a7e..59c0af1e 100644 --- a/apps/cli/src/commands/results/remote.ts +++ b/apps/cli/src/commands/results/remote.ts @@ -8,7 +8,9 @@ import { directPushResults, directorySizeBytes, getResultsRepoStatus, + listGitRuns, loadConfig, + normalizeResultsConfig, resolveResultsRepoRunsDir, syncResultsRepo, } from '@agentv/core'; @@ -59,15 +61,6 @@ function getStatusMessage(error: unknown): string { return error instanceof Error ? error.message : String(error); } -function normalizeResultsConfig(config: ResultsConfig): Required { - return { - repo: config.repo, - path: config.path, - auto_push: config.auto_push === true, - branch_prefix: config.branch_prefix?.trim() || 'eval-results', - }; -} - function statusForResult(result: EvaluationResult): 'PASS' | 'FAIL' | 'ERROR' { if (result.executionStatus === 'execution_error' || result.error) { return 'ERROR'; @@ -185,15 +178,45 @@ export async function listMergedResultFiles( }; } - const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map( - (meta) => - ({ - ...meta, - filename: encodeRemoteRunId(meta.filename), - raw_filename: meta.filename, + let remoteRuns: SourcedResultFileMeta[] = []; + if (config.mode === 'github') { + try { + const gitRuns = await listGitRuns(config.path); + remoteRuns = gitRuns.map((r) => ({ + filename: encodeRemoteRunId(r.run_id), + raw_filename: r.run_id, source: 'remote' as const, - }) satisfies SourcedResultFileMeta, - ); + path: path.join(config.path, r.manifest_path), + displayName: r.display_name, + timestamp: r.timestamp, + testCount: r.test_count, + passRate: r.pass_rate || 0, + avgScore: r.avg_score || 0, + sizeBytes: r.size_bytes || 0, + })); + } catch (error) { + console.error('git-native listing failed, falling back', error); + remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map( + (meta) => + ({ + ...meta, + filename: 
/** Page size applied when a request supplies `cursor` without an explicit `limit`. */
const DEFAULT_RUN_PAGE_LIMIT = 50;

/**
 * Parses the `limit` query parameter of the runs listing endpoint.
 *
 * @param limitParam Raw query-string value, or `undefined` when the parameter is absent.
 * @returns `undefined` when the parameter is absent, the parsed value when it is a
 *   positive base-10 integer, and `null` when it is present but invalid
 *   (non-digit characters, empty string, or zero).
 */
function parseRunPageLimit(limitParam: string | undefined): number | undefined | null {
  if (limitParam === undefined) {
    return undefined;
  }
  // Digits only: rejects signs, decimals, exponents and the empty string.
  if (!/^\d+$/.test(limitParam)) {
    return null;
  }
  const limit = Number.parseInt(limitParam, 10);
  return limit > 0 ? limit : null;
}

/**
 * Slices an already-sorted run list into one cursor-addressed page.
 *
 * The cursor is the `filename` of the last item of the previous page; the page
 * starts at the element immediately after it.
 *
 * @param runs Full list, in the order it should be served.
 * @param cursor `filename` of the last item already delivered, if any.
 * @param limit Maximum page size; `undefined` disables pagination entirely.
 * @returns The page and, only when more items remain after it, `nextCursor`.
 *   An unknown cursor yields an empty page with no `nextCursor`.
 */
function paginateRuns<T extends { filename: string }>(
  runs: T[],
  cursor: string | undefined,
  limit: number | undefined,
): { runs: T[]; nextCursor?: string } {
  if (limit === undefined) {
    return { runs };
  }

  let start = 0;
  if (cursor) {
    const cursorIndex = runs.findIndex((run) => run.filename === cursor);
    if (cursorIndex === -1) {
      return { runs: [] };
    }
    start = cursorIndex + 1;
  }

  const end = start + limit;
  const page = runs.slice(start, end);
  const last = page.length > 0 ? page[page.length - 1] : undefined;
  // Only advertise a cursor when at least one item lies beyond this page.
  return end < runs.length && last !== undefined
    ? { runs: page, nextCursor: last.filename }
    : { runs: page };
}

/**
 * GET handler for the runs listing.
 *
 * Query parameters:
 * - `limit`: positive integer page size; an invalid value produces a 400 response.
 * - `cursor`: `filename` of the last run of the previous page. When `cursor` is
 *   given without `limit`, `DEFAULT_RUN_PAGE_LIMIT` is used. With neither
 *   parameter the full list is returned.
 *
 * The merged list is paginated BEFORE per-run enrichment so that the result-file
 * read, live-status lookup and tag lookup are performed only for runs that are
 * actually returned. Cursor matching uses `filename`, which enrichment does not
 * alter, so the response is the same as enriching first and slicing afterwards.
 */
async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
  // Validate the request before doing any listing work.
  const parsedLimit = parseRunPageLimit(c.req.query('limit'));
  if (parsedLimit === null) {
    return c.json({ error: 'limit must be a positive integer' }, 400);
  }
  const cursor = c.req.query('cursor');
  const limit = parsedLimit ?? (cursor ? DEFAULT_RUN_PAGE_LIMIT : undefined);

  const { runs: metas } = await listMergedResultFiles(searchDir);
  const { threshold: passThreshold } = loadStudioConfig(agentvDir);
  const page = paginateRuns(metas, cursor, limit);

  const runs = page.runs.map((m) => {
    let target: string | undefined;
    let experiment = inferExperimentFromRunId(m.raw_filename);
    let passRate = m.passRate;
    try {
      const records = loadLightweightResults(m.path);
      if (records.length > 0) {
        target = records[0].target;
        experiment = records[0].experiment ?? experiment;
        passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
      } else {
        // Run is in-progress with 0 results written yet — fall back to the
        // in-memory target stored when the Studio launched this run.
        target = getActiveRunTarget(m.path);
      }
    } catch {
      // Enrichment is best-effort: keep the listing metadata when the result
      // file cannot be read.
    }
    // Surface live status for Studio-launched runs that are still starting
    // or running so the RunList can render a spinner instead of the
    // pass/fail dot derived from a 0% pass rate.
    const liveStatus = getActiveRunStatus(m.path);
    const tagsEntry = readRunTags(m.path);
    return {
      filename: m.filename,
      display_name: m.displayName,
      path: m.path,
      timestamp: m.timestamp,
      test_count: m.testCount,
      pass_rate: passRate,
      avg_score: m.avgScore,
      size_bytes: m.sizeBytes,
      source: m.source,
      ...(target && { target }),
      ...(experiment && { experiment }),
      ...(tagsEntry && { tags: tagsEntry.tags }),
      ...(liveStatus && { status: liveStatus }),
    };
  });

  return c.json({
    runs,
    ...(page.nextCursor ? { next_cursor: page.nextCursor } : {}),
  });
}
expect(manifest.test_ids).toEqual(['test-01']); - // Step 2: Write mock response.md (simulating target execution) - await writeFile(join(OUT_DIR, 'input-test', 'test-01', 'response.md'), 'hello world response'); + // Step 2: Write mock response.md (simulating target execution) + await writeFile(join(outDir, 'input-test', 'test-01', 'response.md'), 'hello world response'); - // Step 3: pipeline grade - await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]); - const gradeResult = JSON.parse( - await readFile( - join(OUT_DIR, 'input-test', 'test-01', 'code_grader_results', 'contains_hello.json'), - 'utf8', - ), - ); - expect(gradeResult.score).toBe(1); + // Step 3: pipeline grade + await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', outDir]); + const gradeResult = JSON.parse( + await readFile( + join(outDir, 'input-test', 'test-01', 'code_grader_results', 'contains_hello.json'), + 'utf8', + ), + ); + expect(gradeResult.score).toBe(1); - // Step 4: Write mock LLM grader result to disk, then run pipeline bench - const llmResultsDir = join(OUT_DIR, 'input-test', 'test-01', 'llm_grader_results'); - await mkdir(llmResultsDir, { recursive: true }); - await writeFile( - join(llmResultsDir, 'relevance.json'), - JSON.stringify({ - score: 0.9, - assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }], - }), - ); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); + // Step 4: Write mock LLM grader result to disk, then run pipeline bench + const llmResultsDir = join(outDir, 'input-test', 'test-01', 'llm_grader_results'); + await mkdir(llmResultsDir, { recursive: true }); + await writeFile( + join(llmResultsDir, 'relevance.json'), + JSON.stringify({ + score: 0.9, + assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }], + }), + ); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', outDir]); - // Verify final artifacts - const grading = JSON.parse( - await readFile(join(OUT_DIR, 'input-test', 
'test-01', 'grading.json'), 'utf8'), - ); - expect(grading.graders).toHaveLength(2); - expect(grading.summary.pass_rate).toBeGreaterThan(0); + // Verify final artifacts + const grading = JSON.parse( + await readFile(join(outDir, 'input-test', 'test-01', 'grading.json'), 'utf8'), + ); + expect(grading.graders).toHaveLength(2); + expect(grading.summary.pass_rate).toBeGreaterThan(0); - const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); - const indexLines = indexContent - .trim() - .split('\n') - .map((line) => JSON.parse(line)); - expect(indexLines).toHaveLength(1); - expect(indexLines[0].test_id).toBe('test-01'); + const indexContent = await readFile(join(outDir, 'index.jsonl'), 'utf8'); + const indexLines = indexContent + .trim() + .split('\n') + .map((line) => JSON.parse(line)); + expect(indexLines).toHaveLength(1); + expect(indexLines[0].test_id).toBe('test-01'); - const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); - expect(benchmark.run_summary).toBeDefined(); - }, 30_000); + const benchmark = JSON.parse(await readFile(join(outDir, 'benchmark.json'), 'utf8')); + expect(benchmark.run_summary).toBeDefined(); + }, + PIPELINE_E2E_TIMEOUT_MS, + ); }); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 75f286fb..446460f4 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1,4 +1,5 @@ import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; +import { execSync } from 'node:child_process'; import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import os from 'node:os'; import { tmpdir } from 'node:os'; @@ -58,6 +59,79 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } +function cleanGitEnv(): Record { + const env: Record = {}; + for (const [key, value] of 
Object.entries(process.env)) { + if (value !== undefined && !(key.startsWith('GIT_') && key !== 'GIT_SSH_COMMAND')) { + env[key] = value; + } + } + return env; +} + +function git(command: string, cwd: string): string { + return execSync(command, { cwd, encoding: 'utf8', env: cleanGitEnv() }).trim(); +} + +function initializeRemoteRepo(rootDir: string): { remoteDir: string; cloneDir: string } { + const remoteDir = path.join(rootDir, 'results-remote.git'); + git(`git init --bare --initial-branch=main --quiet "${remoteDir}"`, rootDir); + + const seedDir = path.join(rootDir, 'results-seed'); + git(`git clone --quiet "${remoteDir}" "${seedDir}"`, rootDir); + git('git config user.email "test@example.com"', seedDir); + git('git config user.name "Test User"', seedDir); + writeFileSync(path.join(seedDir, 'README.md'), '# results repo\n'); + git('git add README.md && git commit --quiet -m "seed repo"', seedDir); + git('git push --quiet origin main', seedDir); + + const cloneDir = path.join(rootDir, 'results-clone'); + git(`git clone --quiet "${remoteDir}" "${cloneDir}"`, rootDir); + git('git config user.email "test@example.com"', cloneDir); + git('git config user.name "Test User"', cloneDir); + + return { remoteDir, cloneDir }; +} + +function writeRemoteRunArtifact( + cloneDir: string, + experiment: string, + timestamp: string, + resultRecord: object, +): string { + const isoTimestamp = timestamp.replace( + /^(\d{4}-\d{2}-\d{2})T(\d{2})-(\d{2})-(\d{2})-(\d{3})Z$/, + '$1T$2:$3:$4.$5Z', + ); + const runDir = path.join(cloneDir, 'runs', experiment, timestamp); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultRecord)); + writeFileSync( + path.join(runDir, 'benchmark.json'), + JSON.stringify( + { + metadata: { + timestamp: isoTimestamp, + experiment, + targets: ['gpt-4o'], + tests_run: ['test-greeting'], + }, + run_summary: { + 'gpt-4o': { + pass_rate: { mean: 1 }, + }, + }, + }, + null, + 2, + ), + ); + git(`git add 
"${runDir}" && git commit --quiet -m "add ${experiment}"`, cloneDir); + git('git push --quiet origin main', cloneDir); + git('git fetch --quiet origin --prune', cloneDir); + return `${experiment}::${timestamp}`; +} + // ── resolveSourceFile ──────────────────────────────────────────────────── describe('resolveSourceFile', () => { @@ -392,6 +466,12 @@ describe('serve app', () => { // ── GET /api/runs ─────────────────────────────────────────────────── describe('GET /api/runs', () => { + function createLocalRun(baseDir: string, filename: string, ...records: object[]) { + const runDir = path.join(baseDir, '.agentv', 'results', 'runs', filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(...records)); + } + it('returns empty runs list for temp directory', async () => { const app = createApp([], tempDir, undefined, undefined, { studioDir }); const res = await app.request('/api/runs'); @@ -400,6 +480,65 @@ describe('serve app', () => { expect(data.runs).toEqual([]); }); + it('supports cursor pagination when limit is provided', async () => { + createLocalRun(tempDir, '2026-03-25T10-00-00-000Z', RESULT_A); + createLocalRun(tempDir, '2026-03-25T11-00-00-000Z', RESULT_A); + createLocalRun(tempDir, '2026-03-25T12-00-00-000Z', RESULT_A); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const firstRes = await app.request('/api/runs?limit=2'); + expect(firstRes.status).toBe(200); + const firstPage = (await firstRes.json()) as { + runs: Array<{ filename: string }>; + next_cursor?: string; + }; + expect(firstPage.runs.map((run) => run.filename)).toEqual([ + '2026-03-25T12-00-00-000Z', + '2026-03-25T11-00-00-000Z', + ]); + expect(firstPage.next_cursor).toBe('2026-03-25T11-00-00-000Z'); + + const secondRes = await app.request( + `/api/runs?limit=2&cursor=${encodeURIComponent(firstPage.next_cursor ?? 
'')}`, + ); + expect(secondRes.status).toBe(200); + const secondPage = (await secondRes.json()) as { + runs: Array<{ filename: string }>; + next_cursor?: string; + }; + expect(secondPage.runs.map((run) => run.filename)).toEqual(['2026-03-25T10-00-00-000Z']); + expect(secondPage.next_cursor).toBeUndefined(); + }); + + it('returns an empty page for unknown cursors', async () => { + createLocalRun(tempDir, '2026-03-25T10-00-00-000Z', RESULT_A); + createLocalRun(tempDir, '2026-03-25T11-00-00-000Z', RESULT_A); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs?limit=1&cursor=missing-run'); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + runs: Array<{ filename: string }>; + next_cursor?: string; + }; + expect(data.runs).toEqual([]); + expect(data.next_cursor).toBeUndefined(); + }); + + it('rejects invalid pagination limits', async () => { + createLocalRun(tempDir, '2026-03-25T10-00-00-000Z', RESULT_A); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs?limit=0'); + + expect(res.status).toBe(400); + await expect(res.json()).resolves.toEqual({ + error: 'limit must be a positive integer', + }); + }); + it('tags local runs with source metadata', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); @@ -501,18 +640,15 @@ describe('serve app', () => { writeFileSync( path.join(tempDir, '.agentv', 'config.yaml'), `results: + mode: github repo: EntityProcess/agentv-evals - path: autopilot-dev/runs `, ); const remoteRunDir = path.join( process.env.AGENTV_HOME, - 'cache', - 'results-repo', + 'results', 'EntityProcess-agentv-evals', - 'repo', - 'autopilot-dev', 'runs', 'default', '2026-03-26T10-00-00-000Z', @@ -540,6 +676,53 @@ describe('serve app', () => { } } }); + + it('lists and loads git-native remote runs from the configured clone path', async () => { + 
const { remoteDir, cloneDir } = initializeRemoteRepo(tempDir); + const runId = writeRemoteRunArtifact( + cloneDir, + 'green-uat', + '2026-03-26T10-00-00-000Z', + RESULT_A, + ); + + mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); + writeFileSync( + path.join(tempDir, '.agentv', 'config.yaml'), + `results: + mode: github + repo: file://${remoteDir} + path: ${cloneDir} +`, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const listRes = await app.request('/api/runs'); + expect(listRes.status).toBe(200); + const listData = (await listRes.json()) as { + runs: Array<{ filename: string; source: string; experiment?: string; pass_rate?: number }>; + }; + expect(listData.runs).toHaveLength(1); + expect(listData.runs[0]).toMatchObject({ + filename: `remote::${runId}`, + source: 'remote', + experiment: 'green-uat', + pass_rate: 1, + }); + + const detailRes = await app.request( + `/api/runs/${encodeURIComponent(listData.runs[0].filename)}`, + ); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { + source: string; + results: Array<{ test_id?: string; testId?: string }>; + }; + expect(detailData.source).toBe('remote'); + expect(detailData.results).toHaveLength(1); + expect(detailData.results[0]).toMatchObject({ testId: 'test-greeting' }); + }, 15000); }); describe('GET /api/projects/all-runs', () => { @@ -581,29 +764,42 @@ describe('serve app', () => { describe('GET /api/remote/status', () => { it('reports configured remote status with graceful local-only fallback', async () => { - mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); - writeFileSync( - path.join(tempDir, '.agentv', 'config.yaml'), - `results: + const previousHome = process.env.AGENTV_HOME; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home-status'); + + try { + mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); + writeFileSync( + path.join(tempDir, '.agentv', 'config.yaml'), + `results: + mode: 
github repo: EntityProcess/agentv-evals - path: autopilot-dev/runs `, - ); + ); - const app = createApp([], tempDir, tempDir, undefined, { studioDir }); - const res = await app.request('/api/remote/status'); + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/remote/status'); - expect(res.status).toBe(200); - const data = (await res.json()) as { - configured: boolean; - available: boolean; - repo: string; - path: string; - }; - expect(data.configured).toBe(true); - expect(data.available).toBe(false); - expect(data.repo).toBe('EntityProcess/agentv-evals'); - expect(data.path).toBe('autopilot-dev/runs'); + expect(res.status).toBe(200); + const data = (await res.json()) as { + configured: boolean; + available: boolean; + repo: string; + path: string; + }; + expect(data.configured).toBe(true); + expect(data.available).toBe(false); + expect(data.repo).toBe('EntityProcess/agentv-evals'); + expect(data.path).toBe( + path.join(tempDir, 'agentv-home-status', 'results', 'EntityProcess-agentv-evals'), + ); + } finally { + if (previousHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousHome; + } + } }); }); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 3ada5bb4..8db576c6 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -20,7 +20,6 @@ const __dirname = path.dirname(__filename); const projectRoot = path.resolve(__dirname, '../../..'); const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); const MOCK_RUNNER = path.join(projectRoot, 'apps/cli/test/fixtures/mock-run-evaluation.ts'); - async function createFixture(): Promise { const baseDir = await mkdtemp(path.join(tmpdir(), 'agentv-cli-test-')); const suiteDir = path.join(baseDir, 'suite'); @@ -201,22 +200,6 @@ async function readDiagnostics(fixture: EvalFixture): Promise { - it('documents the bare `eval` 
shorthand in eval help', async () => { - const fixture = await createFixture(); - try { - const { stdout } = await runCli(fixture, ['eval', '--help']); - - expect(stdout).toContain('Evaluation commands.'); - expect(stdout).toContain('agentv eval '); - expect(stdout).toContain('agentv eval run '); - expect(stdout).toContain('- run'); - expect(stdout).toContain('- assert'); - expect(stdout).toContain('- aggregate'); - } finally { - await rm(fixture.baseDir, { recursive: true, force: true }); - } - }); - it('writes results, summary, and prompt dumps using default directories', async () => { const fixture = await createFixture(); try { diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 974d169a..966cf991 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -13,6 +13,7 @@ */ import type React from 'react'; +import { useEffect, useRef } from 'react'; import { Link } from '@tanstack/react-router'; @@ -26,6 +27,9 @@ interface RunListProps { runs: RunMeta[]; projectId?: string; emptyMessage?: React.ReactNode; + hasNextPage?: boolean; + isFetchingNextPage?: boolean; + onLoadMore?: () => void; } function formatDate(ts: string | undefined | null): { date: string; full: string } { @@ -48,9 +52,50 @@ function formatDate(ts: string | undefined | null): { date: string; full: string } } -export function RunList({ runs, projectId, emptyMessage }: RunListProps) { +export function RunList({ + runs, + projectId, + emptyMessage, + hasNextPage = false, + isFetchingNextPage = false, + onLoadMore, +}: RunListProps) { const { data: config } = useStudioConfig(projectId); const passThreshold = config?.threshold ?? 
DEFAULT_PASS_THRESHOLD; + const sentinelRef = useRef(null); + const requestingNextPageRef = useRef(false); + + useEffect(() => { + if (!isFetchingNextPage) { + requestingNextPageRef.current = false; + } + }, [isFetchingNextPage]); + + useEffect(() => { + if (!hasNextPage || !onLoadMore) { + return; + } + const node = sentinelRef.current; + if (!node) { + return; + } + + const observer = new IntersectionObserver( + (entries) => { + if ( + entries.some((entry) => entry.isIntersecting) && + !isFetchingNextPage && + !requestingNextPageRef.current + ) { + requestingNextPageRef.current = true; + onLoadMore(); + } + }, + { rootMargin: '200px 0px' }, + ); + observer.observe(node); + return () => observer.disconnect(); + }, [hasNextPage, isFetchingNextPage, onLoadMore]); if (runs.length === 0) { return ( @@ -155,6 +200,13 @@ export function RunList({ runs, projectId, emptyMessage }: RunListProps) { ); })} + {(hasNextPage || isFetchingNextPage) && ( + + + {isFetchingNextPage ? 'Loading more runs…' : 'Scroll to load more…'} + + + )} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 883663c8..67e51fc6 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -5,7 +5,12 @@ * and the same-origin Hono server serves in production. 
*/ -import { queryOptions, useQuery } from '@tanstack/react-query'; +import { + infiniteQueryOptions, + queryOptions, + useInfiniteQuery, + useQuery, +} from '@tanstack/react-query'; import type { CategoriesResponse, @@ -59,12 +64,40 @@ async function fetchText(url: string): Promise { // ── Query option factories ────────────────────────────────────────────── +const RUNS_PAGE_LIMIT = 50; + +function buildRunListUrl(baseUrl: string, cursor?: string): string { + const params = new URLSearchParams({ limit: String(RUNS_PAGE_LIMIT) }); + if (cursor) { + params.set('cursor', cursor); + } + return `${baseUrl}?${params.toString()}`; +} + +function flattenRunListPages(pages: RunListResponse[] | undefined): RunListResponse { + if (!pages || pages.length === 0) { + return { runs: [] }; + } + return { + runs: pages.flatMap((page) => page.runs), + next_cursor: pages.at(-1)?.next_cursor, + }; +} + export const runListOptions = queryOptions({ queryKey: ['runs'], queryFn: () => fetchJson('/api/runs'), refetchInterval: 5_000, }); +export const infiniteRunListOptions = infiniteQueryOptions({ + queryKey: ['runs', 'infinite'], + initialPageParam: undefined as string | undefined, + queryFn: ({ pageParam }) => fetchJson(buildRunListUrl('/api/runs', pageParam)), + getNextPageParam: (lastPage) => lastPage.next_cursor, + refetchInterval: 5_000, +}); + export function runDetailOptions(filename: string) { return queryOptions({ queryKey: ['runs', filename], @@ -206,6 +239,14 @@ export function useRunList() { return useQuery(runListOptions); } +export function useInfiniteRunList() { + const query = useInfiniteQuery(infiniteRunListOptions); + return { + ...query, + data: flattenRunListPages(query.data?.pages), + }; +} + export function useRunDetail(filename: string) { return useQuery(runDetailOptions(filename)); } @@ -327,10 +368,30 @@ export function projectRunListOptions(projectId: string) { }); } +export function infiniteProjectRunListOptions(projectId: string) { + return 
infiniteQueryOptions({ + queryKey: ['projects', projectId, 'runs', 'infinite'], + initialPageParam: undefined as string | undefined, + queryFn: ({ pageParam }) => + fetchJson(buildRunListUrl(`${projectApiBase(projectId)}/runs`, pageParam)), + getNextPageParam: (lastPage) => lastPage.next_cursor, + enabled: !!projectId, + refetchInterval: 5_000, + }); +} + export function useProjectRunList(projectId: string) { return useQuery(projectRunListOptions(projectId)); } +export function useInfiniteProjectRunList(projectId: string) { + const query = useInfiniteQuery(infiniteProjectRunListOptions(projectId)); + return { + ...query, + data: flattenRunListPages(query.data?.pages), + }; +} + export function projectRunDetailOptions(projectId: string, filename: string) { return queryOptions({ queryKey: ['projects', projectId, 'runs', filename], diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 748300a6..595babb0 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -32,6 +32,7 @@ export interface RunMeta { export interface RunListResponse { runs: RunMeta[]; + next_cursor?: string; } export interface TokenUsage { @@ -257,7 +258,7 @@ export interface RemoteStatusResponse { configured: boolean; available: boolean; repo?: string; - cache_dir?: string; + local_dir?: string; path?: string; auto_push?: boolean; branch_prefix?: string; diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 8461ab54..921889c6 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -22,12 +22,13 @@ import { syncRemoteResultsApi, useCompare, useEvalRuns, + useInfiniteRunList, useProjectList, useRemoteStatus, - useRunList, useStudioConfig, } from '~/lib/api'; import { type StudioTabId, resolveIndexRoute } from '~/lib/navigation'; +import type { RunMeta } from '~/lib/types'; type TabId = StudioTabId; const tabs: { id: TabId; label: string }[] = [ @@ -184,7 +185,8 @@ function 
SingleProjectHome() { const tab = searchParams.tab as TabId | undefined; const navigate = useNavigate(); const queryClient = useQueryClient(); - const { data, isLoading, error } = useRunList(); + const { data, isLoading, error, hasNextPage, fetchNextPage, isFetchingNextPage } = + useInfiniteRunList(); const { data: remoteStatus } = useRemoteStatus(); const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); @@ -265,6 +267,9 @@ function SingleProjectHome() { remoteStatus={remoteStatus} syncInFlight={syncInFlight} onSyncRemote={handleSyncRemote} + hasNextPage={hasNextPage} + isFetchingNextPage={isFetchingNextPage} + onLoadMore={() => void fetchNextPage()} /> )} {activeTab === 'experiments' && } @@ -298,8 +303,11 @@ function RunsTabContent({ remoteStatus, syncInFlight, onSyncRemote, + hasNextPage, + isFetchingNextPage, + onLoadMore, }: { - runs: NonNullable['data']>['runs']; + runs: RunMeta[]; isLoading: boolean; error: Error | null; sourceFilter: RunSourceFilter; @@ -307,6 +315,9 @@ function RunsTabContent({ remoteStatus: ReturnType['data']; syncInFlight: boolean; onSyncRemote: () => void; + hasNextPage: boolean | undefined; + isFetchingNextPage: boolean; + onLoadMore: () => void; }) { if (isLoading) { return ; @@ -332,6 +343,9 @@ function RunsTabContent({ /> ('all'); @@ -195,7 +196,13 @@ function ProjectRunsTab({ projectId }: { projectId: string }) { syncInFlight={syncInFlight} onSync={handleSyncRemote} /> - + void fetchNextPage()} + /> ); } diff --git a/docs/plans/git-native-results-goal.md b/docs/plans/git-native-results-goal.md new file mode 100644 index 00000000..d5db62ff --- /dev/null +++ b/docs/plans/git-native-results-goal.md @@ -0,0 +1,42 @@ +# Goal: Complete git-native-results PR (#1261) + +## Objective +Implement the git-native results storage architecture and land PR #1261 as a clean, tested, manually verified change. 
+ +## Success Criteria +- All implementation passes completed per design doc +- Full test suite green (unit + integration + existing 1782 core + 553 CLI tests) +- E2E manual test using agent-browser against real test results repo +- Red/green UAT documented before review +- No regressions + +## Work Location +- Worktree: `agentv.worktrees/git-native-results/` +- Branch: `feat/git-native-results` + +## Key Decisions Confirmed +- Dedicated results repo model → write directly to `main` of results repo (no separate branch needed) +- Use raw `git` subprocess (not go-git) for ls-tree / cat-file path +- Follow exact order in design doc + +## Non-Goals +- P5 zero-config mode +- Caching +- Multi-mode beyond github + +## Verification +1. Automated tests +2. Manual agent-browser E2E in Studio +3. Performance check with 500+ runs repo +4. Lint + typecheck clean + +Owner: Agent + Chris T + +## Latest Progress (2026-05-21) + +- Docker ownership fix implemented in docker-compose.yml (`user: "${UID}:${GID}"`) +- Write path (`commitAndPushRun`) largely complete via parallel work +- Read path functional but needs hardening +- Bun dependencies reinstalled in worktree +- GitHub Actions currently failing on dependency resolution in CI +- Next focus: Fix CI, add tests, implement pagination diff --git a/docs/plans/git-native-results.md b/docs/plans/git-native-results.md new file mode 100644 index 00000000..1d625f3c --- /dev/null +++ b/docs/plans/git-native-results.md @@ -0,0 +1,162 @@ +# Git-native results storage + +**Status**: design approved, implementation pending +**Tracks**: issue #1259 (supersedes closed PR #1260) +**Scope**: single PR; breaking changes accepted (no production users yet) + +--- + +## Why + +`/api/runs` polls every 5s and does O(N) per-manifest reads (`readdir` + `statSync` + `loadResultFile` per run). At hundreds of runs it stalls; at thousands it falls over. 
The original PR #1260 tried to fix this with an append-only `index/runs.jsonl` file, which works but adds a second source of truth that can drift, grows forever, and requires a sha-amend dance plus a `reindex` migration command. + +After comparing with **entireio** (single-ref + git tree as index) and **skillfully** (explicit `sourceMode = github_import` pattern with PR-based writes for human-curated content), the cleaner architecture treats **git as the canonical store**, not as a transport layer. + +## Core idea + +The git tree IS the index. `git ls-tree -r origin/main -- runs/` lists every run path without reading any blob. `git cat-file --batch` reads existing `benchmark.json` blobs in one subprocess call. No separate index file. No drift. Natural pruning when runs are deleted. With `--filter=blob:none` clone, individual run blobs are only fetched lazily when a user opens the detail view. + +## Architecture + +### Storage + +- The configured remote `results.repo` is **the** storage location. +- The local clone at `results.path` (filesystem path) is the working copy. +- No more `.agentv/results/runs/` writes in the source project. No more gitignored results. + +```yaml +# config.yaml +results: + mode: github # required, only valid value today + repo: myorg/eval-results # remote + path: ~/data/agentv-results # optional; default ~/.agentv/results/<slug>/ + auto_push: true # default +``` + +`mode: github` is explicit (extension point; mirrors skillfully's `sourceMode` pattern). `path` is the **local filesystem location** of the clone (breaking change — was previously the subdir within the remote repo). Runs always land at `<path>/runs/<experiment>/<timestamp>/` regardless. + +### Writes + +Every `agentv eval` is one atomic operation: + +1. `git fetch origin --prune` (refresh; no checkout) +2. Write artifacts into working tree at `<path>/runs/<experiment>/<timestamp>/` +3. `git add runs/<experiment>/<timestamp>/` +4. `git commit -m "<title>" -m "Agentv-Run: <run-id>"` (P6 trailer baked in) +5. 
If `auto_push`: `git push origin HEAD:main` with retry-on-non-fast-forward (rebase + retry) + +Each run is one commit. Files are unique to that run, so rebases never content-conflict. + +### Reads + +**Listing** (replaces `listResultFilesFromRunsDir`): +- `git ls-tree -r origin/main -- runs/` → filter for `benchmark.json` paths +- `git cat-file --batch` → read those blobs in one subprocess +- Derive `run_id` from path (same logic as current `buildRunId`) +- Sort by timestamp descending +- Apply cursor pagination + +**Detail view file reads** (replaces `readFileSync(meta.path)`): +- Committed: `git cat-file -p origin/main:runs/.../<file>` +- In-progress (post-write, pre-commit): `readFileSync(<path>)` from working tree + +**In-progress detection**: between artifact write and commit, files exist only in the working tree. `git status --porcelain runs/` surfaces them; merge with the committed list for the Studio runs view. + +### Sync + +- `agentv eval` does its own fetch + push (no separate sync needed for own work) +- `agentv results sync` = `git fetch origin --prune` (refresh view of others' work) +- No more `git checkout`, no more `git pull --ff-only` +- Studio polls `/api/runs` which reads from git object DB (already current after the most recent fetch) + +### Pagination + +`/api/runs?limit=50&cursor=<run_id>`: +- Cursor is the `run_id` of the last item from the previous page +- Server reads the full sorted list (one `git ls-tree` + one `git cat-file --batch`), finds the cursor, slices `[cursorIdx+1 : cursorIdx+1+limit]`, returns `next_cursor` if more remain +- Studio uses `useInfiniteQuery` + an `IntersectionObserver` sentinel row + +## Implementation passes + +The PR is large but bounded. 
Suggested order within the single PR: + +### Pass 1 — config + paths + +- Update `ResultsConfig` schema: require `mode: github`, repurpose `path` as filesystem location +- Rename `getResultsRepoCachePaths` → `getResultsRepoLocalPaths` +- Rename `cache_dir` → `local_dir` in `ResultsRepoStatus` (wire format too) +- Add config validation: refuse old-style `path: runs` values with migration message + +### Pass 2 — write path + +- Replace `.agentv/results/runs/` writes with direct writes to `<results.path>/runs/...` +- `directPushResults` becomes the only write path (rename to `commitAndPushRun` since it's no longer just a "direct push" mode) +- Add `Agentv-Run:` commit trailer +- Drop `git checkout` from `updateCacheRepo` — only `git fetch --prune` remains +- Rename `updateCacheRepo` → `fetchResultsRepo` + +### Pass 3 — read path + +- New `listResultFilesFromGitTree(repoDir, baseBranch)` using `git ls-tree` + `git cat-file --batch` on `benchmark.json` blobs +- Replace `listResultFilesFromRunsDir` calls for remote runs with the new function +- Detail view reads in `serve.ts` use `git cat-file -p <ref>:<path>` for committed runs +- Working-tree readdir for in-progress runs (detected via `git status --porcelain`) +- Drop `loadLightweightResults` enrichment loop in `handleRuns` — `benchmark.json` already has `target`, `experiment`, and `pass_rate` + +### Pass 4 — pagination + +- `/api/runs` accepts `limit` and `cursor` query params +- Server slices the sorted list by cursor, returns `next_cursor` +- `RunListResponse` gets `next_cursor?: string` +- Studio: `runListOptions` → `infiniteQueryOptions` +- `RunList.tsx`: flatten pages, add `IntersectionObserver` sentinel + +### Pass 5 — cleanup + +- Remove the entire P1 PR scope (closed PR #1260): `RunIndexEntry`, `appendToRunIndex`, `readRunIndex`, `reindexResultsRepo`, `agentv results reindex` command, `index/runs.jsonl` writes +- Remove `localResults` listing — local-only mode is no longer supported +- Remove 
`SourcedResultFileMeta.source` field — runs are no longer "local" or "remote", they're either committed or in-progress +- Update docs site (`apps/web/src/content/docs/`) +- Update skill files (`plugins/agentv-dev/skills/agentv-eval-builder/`) +- Update examples that hardcoded `.agentv/results/runs/` paths + +## Breaking changes + +| Change | Impact | +|--------|--------| +| `results.repo` becomes required | Users without a results repo can't run evals until they configure one | +| `results.path` repurposed (subdir → filesystem path) | Existing configs with `path: runs` fail loudly with migration message | +| No more `.agentv/results/runs/` writes | Project-local results no longer exist; everything lives in the configured `path` | +| `cache_dir` → `local_dir` in status responses | Studio + any external scripts reading status need to update | +| `SourcedResultFileMeta.source` removed | Studio "source" badge becomes "in progress / shared" | + +Breaking changes accepted because no production users yet. Document in release notes; require fresh config to upgrade. + +## Test plan + +- Unit tests for `git ls-tree` + `git cat-file --batch` parsing helpers +- Integration test that spins up a tmp git repo, writes runs via the new write path, lists via the new read path, asserts results +- Pagination unit tests (cursor in/out of bounds, exact-boundary cases) +- E2E: run an actual eval against a real (test-scoped) results repo, verify the commit lands with the `Agentv-Run:` trailer, `git ls-tree` shows the run, Studio renders it + +## Deferred to future PRs + +- **P5 zero-config same-repo mode** — write to `refs/agentv/runs/v1` in the source repo when no `results.repo` is configured. Independent feature; design pattern works the same. +- **Multi-mode support** — if a cloud Studio gets built later, `mode: cloud` would mirror skillfully's "managed in Skillfully" mode. The current explicit `mode: github` field is the extension point. 
+- **PR-based publishing** — for human-curated content. Eval results are machine-generated, so direct commit is correct. If users want review-before-merge for sensitive evals (e.g., regulatory benchmarks), add `share: auto-pr` later. +- **In-memory list caching** — P2 from #1259. The git-object-DB read path is fast enough that caching is not needed today. Revisit if profiling shows it's a bottleneck. + +## Open implementation questions + +1. **Branch model**: `origin/main` or a dedicated `origin/agentv-runs/main`? Current vote: `main`, since this is a dedicated results repo. +2. **What to do on `git fetch` failures during `agentv eval`**? Current vote: warn, proceed with stale local state, surface the error in Studio. Don't block the eval — local commit always works. +3. **`gh` CLI dependency**: stays scoped to existing PR-related code paths. The new git-native flow uses raw `git` only. + +## What this PR does NOT do + +- Doesn't add a separate index file (the index IS the git tree) +- Doesn't ship a `reindex` migration command (nothing to backfill — `benchmark.json` already exists per run) +- Doesn't change the artifact format (`benchmark.json`, `index.jsonl`, per-test dirs stay as-is) +- Doesn't add server-side caching (deferred) +- Doesn't add PR-based publishing (deferred) +- Doesn't touch the source repo's commit history (only the configured `results.repo`) diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index b7603f2d..462a79e7 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -37,8 +37,10 @@ export type ExecutionDefaults = { }; export type ResultsConfig = { + readonly mode: 'github'; readonly repo: string; - readonly path: string; + /** Local filesystem path for the results clone. Optional; defaults to ~/.agentv/results/<slug>/. 
*/ + readonly path?: string; readonly auto_push?: boolean; readonly branch_prefix?: string; }; @@ -558,6 +560,16 @@ export function parseExecutionDefaults( return Object.keys(result).length > 0 ? (result as ExecutionDefaults) : undefined; } +function isFilesystemPath(p: string): boolean { + return ( + p.startsWith('/') || + p.startsWith('~/') || + p.startsWith('~\\') || + p === '~' || + /^[A-Za-z]:[/\\]/.test(p) + ); +} + export function parseResultsConfig(raw: unknown, configPath: string): ResultsConfig | undefined { if (raw === undefined || raw === null) { return undefined; @@ -568,17 +580,32 @@ export function parseResultsConfig(raw: unknown, configPath: string): ResultsCon } const obj = raw as Record<string, unknown>; - const repo = typeof obj.repo === 'string' ? obj.repo.trim() : ''; - const resultsPath = typeof obj.path === 'string' ? obj.path.trim() : ''; + if (obj.mode !== 'github') { + logWarning(`Invalid results.mode in ${configPath}, expected 'github'`); + return undefined; + } + + const repo = typeof obj.repo === 'string' ? obj.repo.trim() : ''; if (!repo) { logWarning(`Invalid results.repo in ${configPath}, expected non-empty string`); return undefined; } - if (!resultsPath) { - logWarning(`Invalid results.path in ${configPath}, expected non-empty string`); - return undefined; + let resultsPath: string | undefined; + if (obj.path !== undefined) { + if (typeof obj.path !== 'string' || obj.path.trim().length === 0) { + logWarning(`Invalid results.path in ${configPath}, expected non-empty string`); + return undefined; + } + const trimmedPath = obj.path.trim(); + if (!isFilesystemPath(trimmedPath)) { + logWarning( + `Invalid results.path in ${configPath}: '${trimmedPath}' looks like a repo subdirectory. results.path now specifies the local filesystem directory for the clone (e.g., ~/data/agentv-results). 
Remove 'path' to use the default or set an absolute/home-relative path.`, + ); + return undefined; + } + resultsPath = trimmedPath; } if (obj.auto_push !== undefined && typeof obj.auto_push !== 'boolean') { @@ -596,8 +623,9 @@ export function parseResultsConfig(raw: unknown, configPath: string): ResultsCon } return { + mode: 'github', repo, - path: resultsPath, + ...(resultsPath !== undefined && { path: resultsPath }), ...(typeof obj.auto_push === 'boolean' && { auto_push: obj.auto_push }), ...(branchPrefix && { branch_prefix: branchPrefix }), }; diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index 04419785..be0f0aa3 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -1,4 +1,4 @@ -import { execFile } from 'node:child_process'; +import { execFile, spawn } from 'node:child_process'; import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { cp, mkdtemp, readdir, rm, stat } from 'node:fs/promises'; import os from 'node:os'; @@ -10,7 +10,7 @@ import type { ResultsConfig } from './loaders/config-loader.js'; const execFileAsync = promisify(execFile); -export interface ResultsRepoCachePaths { +export interface ResultsRepoLocalPaths { readonly rootDir: string; readonly repoDir: string; readonly statusFile: string; @@ -23,7 +23,7 @@ export interface ResultsRepoStatus { readonly path?: string; readonly auto_push?: boolean; readonly branch_prefix?: string; - readonly cache_dir?: string; + readonly local_dir?: string; readonly last_synced_at?: string; readonly last_error?: string; } @@ -61,10 +61,22 @@ function withFriendlyGitHubAuthError(error: unknown): Error { return new Error(message); } +function expandHome(p: string): string { + if (p === '~' || p.startsWith('~/') || p.startsWith('~\\')) { + return path.join(os.homedir(), p.slice(1)); + } + return p; +} + export function normalizeResultsConfig(config: ResultsConfig): 
Required<ResultsConfig> { + const repo = config.repo.trim(); + const resolvedPath = config.path + ? expandHome(config.path.trim()) + : path.join(getAgentvHome(), 'results', sanitizeRepoSlug(repo)); return { - repo: config.repo.trim(), - path: config.path.trim().replace(/^\/+|\/+$/g, ''), + mode: 'github', + repo, + path: resolvedPath, auto_push: config.auto_push === true, branch_prefix: config.branch_prefix?.trim() || 'eval-results', }; @@ -77,7 +89,7 @@ export function resolveResultsRepoUrl(repo: string): string { return `https://github.com/${repo}.git`; } -export function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths { +export function getResultsRepoLocalPaths(repo: string): ResultsRepoLocalPaths { const rootDir = path.join(getAgentvHome(), 'cache', 'results-repo', sanitizeRepoSlug(repo)); return { rootDir, @@ -106,12 +118,12 @@ function writePersistedStatus(statusFile: string, status: PersistedStatus): void async function runCommand( executable: string, args: readonly string[], - options?: { cwd?: string; check?: boolean }, + options?: { cwd?: string; check?: boolean; env?: NodeJS.ProcessEnv }, ): Promise<{ stdout: string; stderr: string }> { try { const { stdout, stderr } = await execFileAsync(executable, [...args], { cwd: options?.cwd, - env: process.env, + env: options?.env ?? 
process.env, }); return { stdout, stderr }; } catch (error) { @@ -126,11 +138,21 @@ async function runCommand( } } +function getGitEnv(): NodeJS.ProcessEnv { + const env: NodeJS.ProcessEnv = {}; + for (const [key, value] of Object.entries(process.env)) { + if (value !== undefined && !(key.startsWith('GIT_') && key !== 'GIT_SSH_COMMAND')) { + env[key] = value; + } + } + return env; +} + async function runGit( args: readonly string[], options?: { cwd?: string; check?: boolean }, ): Promise<{ stdout: string; stderr: string }> { - return runCommand('git', args, options); + return runCommand('git', args, { ...options, env: getGitEnv() }); } async function runGh( @@ -164,14 +186,12 @@ async function resolveDefaultBranch(repoDir: string): Promise<string> { return 'main'; } -async function updateCacheRepo(repoDir: string, baseBranch: string): Promise<void> { +async function fetchResultsRepo(repoDir: string): Promise<void> { await runGit(['fetch', 'origin', '--prune'], { cwd: repoDir }); - await runGit(['checkout', baseBranch], { cwd: repoDir }); - await runGit(['pull', '--ff-only', 'origin', baseBranch], { cwd: repoDir }); } function updateStatusFile(config: ResultsConfig, patch: PersistedStatus): void { - const cachePaths = getResultsRepoCachePaths(config.repo); + const cachePaths = getResultsRepoLocalPaths(config.repo); const current = readPersistedStatus(cachePaths.statusFile); writePersistedStatus(cachePaths.statusFile, { ...current, @@ -181,29 +201,35 @@ function updateStatusFile(config: ResultsConfig, patch: PersistedStatus): void { export async function ensureResultsRepoClone(config: ResultsConfig): Promise<string> { const normalized = normalizeResultsConfig(config); - const cachePaths = getResultsRepoCachePaths(normalized.repo); + const cachePaths = getResultsRepoLocalPaths(normalized.repo); + const cloneDir = normalized.path; mkdirSync(cachePaths.rootDir, { recursive: true }); + mkdirSync(path.dirname(cloneDir), { recursive: true }); - if 
(!existsSync(cachePaths.repoDir)) { + const cloneMissing = !existsSync(cloneDir); + const gitDir = path.join(cloneDir, '.git'); + const cloneEmpty = !cloneMissing && !existsSync(gitDir) && (await readdir(cloneDir)).length === 0; + + if (cloneMissing || cloneEmpty) { try { await runGit([ 'clone', '--filter=blob:none', resolveResultsRepoUrl(normalized.repo), - cachePaths.repoDir, + cloneDir, ]); - return cachePaths.repoDir; + return cloneDir; } catch (error) { updateStatusFile(normalized, { last_error: withFriendlyGitHubAuthError(error).message }); throw withFriendlyGitHubAuthError(error); } } - if (!existsSync(path.join(cachePaths.repoDir, '.git'))) { - throw new Error(`Results repo cache is not a git repository: ${cachePaths.repoDir}`); + if (!existsSync(gitDir)) { + throw new Error(`Results repo clone path is not a git repository: ${cloneDir}`); } - return cachePaths.repoDir; + return cloneDir; } export function getResultsRepoStatus(config?: ResultsConfig): ResultsRepoStatus { @@ -212,22 +238,22 @@ export function getResultsRepoStatus(config?: ResultsConfig): ResultsRepoStatus configured: false, available: false, repo: '', - cache_dir: '', + local_dir: '', }; } const normalized = normalizeResultsConfig(config); - const cachePaths = getResultsRepoCachePaths(normalized.repo); - const persisted = readPersistedStatus(cachePaths.statusFile); + const localPaths = getResultsRepoLocalPaths(normalized.repo); + const persisted = readPersistedStatus(localPaths.statusFile); return { configured: true, - available: existsSync(cachePaths.repoDir), + available: existsSync(normalized.path), repo: normalized.repo, path: normalized.path, auto_push: normalized.auto_push, branch_prefix: normalized.branch_prefix, - cache_dir: cachePaths.repoDir, + local_dir: normalized.path, last_synced_at: persisted.last_synced_at, last_error: persisted.last_error, }; @@ -238,8 +264,7 @@ export async function syncResultsRepo(config: ResultsConfig): Promise<ResultsRep try { const repoDir = await 
ensureResultsRepoClone(normalized); - const baseBranch = await resolveDefaultBranch(repoDir); - await updateCacheRepo(repoDir, baseBranch); + await fetchResultsRepo(repoDir); updateStatusFile(normalized, { last_synced_at: new Date().toISOString(), last_error: undefined, @@ -261,7 +286,7 @@ export async function checkoutResultsRepoBranch( const normalized = normalizeResultsConfig(config); const repoDir = await ensureResultsRepoClone(normalized); const baseBranch = await resolveDefaultBranch(repoDir); - await updateCacheRepo(repoDir, baseBranch); + await fetchResultsRepo(repoDir); await runGit(['checkout', '-B', branchName, `origin/${baseBranch}`], { cwd: repoDir }); updateStatusFile(normalized, { last_error: undefined }); return { @@ -278,7 +303,7 @@ export async function prepareResultsRepoBranch( const normalized = normalizeResultsConfig(config); const cloneDir = await ensureResultsRepoClone(normalized); const baseBranch = await resolveDefaultBranch(cloneDir); - await updateCacheRepo(cloneDir, baseBranch); + await fetchResultsRepo(cloneDir); const worktreeRoot = await mkdtemp(path.join(os.tmpdir(), 'agentv-results-repo-')); const worktreeDir = path.join(worktreeRoot, 'repo'); @@ -312,10 +337,7 @@ export async function stageResultsArtifacts(params: { export function resolveResultsRepoRunsDir(config: ResultsConfig): string { const normalized = normalizeResultsConfig(config); - return path.join( - getResultsRepoCachePaths(normalized.repo).repoDir, - ...normalized.path.split('/'), - ); + return path.join(normalized.path, 'runs'); } export async function directorySizeBytes(targetPath: string): Promise<number> { @@ -358,7 +380,7 @@ export async function pushResultsRepoBranch( ): Promise<void> { const normalized = normalizeResultsConfig(config); await runGit(['push', '-u', 'origin', branchName], { - cwd: cwd ?? getResultsRepoCachePaths(normalized.repo).repoDir, + cwd: cwd ?? 
normalized.path, }); updateStatusFile(normalized, { last_synced_at: new Date().toISOString(), @@ -399,7 +421,7 @@ const DIRECT_PUSH_MAX_RETRIES = 3; /** * Push results directly to the base branch of the results repo. - * Handles non-fast-forward conflicts by pulling with rebase and retrying. + * Handles non-fast-forward conflicts by fetching, rebasing, and retrying. * Returns true if artifacts were pushed, false if no changes were detected. */ export async function directPushResults(params: { @@ -411,9 +433,9 @@ export async function directPushResults(params: { const normalized = normalizeResultsConfig(params.config); const repoDir = await ensureResultsRepoClone(normalized); const baseBranch = await resolveDefaultBranch(repoDir); - await updateCacheRepo(repoDir, baseBranch); + await fetchResultsRepo(repoDir); - const destinationDir = path.join(repoDir, normalized.path, params.destinationPath); + const destinationDir = path.join(repoDir, 'runs', params.destinationPath); await stageResultsArtifacts({ repoDir, sourceDir: params.sourceDir, @@ -429,11 +451,20 @@ export async function directPushResults(params: { return false; } - await runGit(['commit', '-m', params.commitMessage], { cwd: repoDir }); + await runGit( + [ + 'commit', + '-m', + params.commitMessage, + '-m', + `Agentv-Run: ${buildGitRunId(params.destinationPath)}`, + ], + { cwd: repoDir }, + ); for (let attempt = 1; attempt <= DIRECT_PUSH_MAX_RETRIES; attempt++) { try { - await runGit(['push', 'origin', baseBranch], { cwd: repoDir }); + await runGit(['push', 'origin', `HEAD:${baseBranch}`], { cwd: repoDir }); updateStatusFile(normalized, { last_synced_at: new Date().toISOString(), last_error: undefined, @@ -442,7 +473,8 @@ export async function directPushResults(params: { } catch (error) { const message = error instanceof Error ? 
error.message : String(error); if (attempt < DIRECT_PUSH_MAX_RETRIES && message.includes('non-fast-forward')) { - await runGit(['pull', '--rebase', 'origin', baseBranch], { cwd: repoDir }); + await fetchResultsRepo(repoDir); + await runGit(['rebase', `origin/${baseBranch}`], { cwd: repoDir }); } else { throw error; } @@ -451,3 +483,217 @@ export async function directPushResults(params: { return false; } + +export interface GitListedRun { + run_id: string; + experiment: string; + timestamp: string; + pass_rate?: number; + target?: string; + manifest_path: string; + benchmark_path: string; + display_name: string; + test_count: number; + avg_score: number; + size_bytes: number; +} + +type GitBatchBlob = { + readonly size: number; + readonly content: Buffer; +}; + +type GitRunBenchmark = { + readonly metadata?: { + readonly timestamp?: string; + readonly experiment?: string; + readonly targets?: readonly string[]; + readonly tests_run?: readonly string[]; + }; + readonly run_summary?: Record< + string, + { + readonly pass_rate?: { readonly mean?: number }; + } + >; +}; + +function buildGitRunId(relativeRunPath: string): string { + const normalized = relativeRunPath.split(path.sep).join('/'); + const segments = normalized.split('/').filter(Boolean); + if (segments.length >= 2) { + const experiment = segments.slice(0, -1).join('/'); + const timestamp = segments.at(-1); + if (experiment === 'default') { + return timestamp ?? normalized; + } + return `${experiment}::${timestamp}`; + } + return segments[0] ?? relativeRunPath; +} + +function getRunExperiment(runId: string, benchmark: GitRunBenchmark): string { + const experiment = benchmark.metadata?.experiment?.trim(); + if (experiment) { + return experiment; + } + + const separatorIndex = runId.lastIndexOf('::'); + return separatorIndex === -1 ? 
'default' : runId.slice(0, separatorIndex); +} + +function computeAveragePassRate(runSummary: GitRunBenchmark['run_summary']): number | undefined { + if (!runSummary) { + return undefined; + } + + const passRates = Object.values(runSummary) + .map((summary) => summary.pass_rate?.mean) + .filter((value): value is number => typeof value === 'number' && Number.isFinite(value)); + + if (passRates.length === 0) { + return undefined; + } + + return passRates.reduce((sum, value) => sum + value, 0) / passRates.length; +} + +async function runGitBatch(repoDir: string, input: string): Promise<Buffer> { + return new Promise((resolve, reject) => { + const child = spawn('git', ['cat-file', '--batch'], { + cwd: repoDir, + env: getGitEnv(), + stdio: ['pipe', 'pipe', 'pipe'], + }); + + const stdoutChunks: Buffer[] = []; + const stderrChunks: Buffer[] = []; + + child.stdout.on('data', (chunk: Buffer | string) => { + stdoutChunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + }); + child.stderr.on('data', (chunk: Buffer | string) => { + stderrChunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + }); + child.on('error', (error) => reject(withFriendlyGitHubAuthError(error))); + child.on('close', (code) => { + if (code === 0) { + resolve(Buffer.concat(stdoutChunks)); + return; + } + + const stderr = Buffer.concat(stderrChunks).toString('utf8').trim(); + reject( + withFriendlyGitHubAuthError( + stderr.length > 0 ? 
new Error(stderr) : new Error('git cat-file failed'), + ), + ); + }); + + child.stdin.end(input); + }); +} + +function parseGitBatchBlobs(output: Buffer): GitBatchBlob[] { + const blobs: GitBatchBlob[] = []; + let offset = 0; + + while (offset < output.length) { + const headerEnd = output.indexOf(0x0a, offset); + if (headerEnd === -1) { + throw new Error('Malformed git cat-file output: missing header terminator'); + } + + const header = output.subarray(offset, headerEnd).toString('utf8'); + offset = headerEnd + 1; + + if (header.length === 0) { + continue; + } + + const missingMatch = /^(.*) missing$/.exec(header); + if (missingMatch) { + continue; + } + + const headerMatch = /^(.*) (\w+) (\d+)$/.exec(header); + if (!headerMatch) { + throw new Error(`Malformed git cat-file header: ${header}`); + } + + const [, objectRef, objectType, sizeText] = headerMatch; + if (objectType !== 'blob') { + throw new Error(`Unsupported git object type for ${objectRef}: ${objectType}`); + } + + const size = Number.parseInt(sizeText, 10); + const contentEnd = offset + size; + if (contentEnd > output.length) { + throw new Error(`Malformed git cat-file output for ${objectRef}: truncated blob content`); + } + + blobs.push({ + size, + content: output.subarray(offset, contentEnd), + }); + offset = contentEnd; + + if (offset < output.length && output[offset] === 0x0a) { + offset += 1; + } + } + + return blobs; +} + +export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise<GitListedRun[]> { + const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, 'runs'], { + cwd: repoDir, + }); + + const benchmarkPaths = treeOut + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line.endsWith('/benchmark.json')); + if (benchmarkPaths.length === 0) { + return []; + } + + const batchInput = `${benchmarkPaths.map((benchmarkPath) => `${ref}:${benchmarkPath}`).join('\n')}\n`; + const blobs = parseGitBatchBlobs(await runGitBatch(repoDir, 
batchInput)); + if (blobs.length !== benchmarkPaths.length) { + throw new Error( + `Expected ${benchmarkPaths.length} git blobs but received ${blobs.length} while listing results runs`, + ); + } + + const runs = blobs.flatMap((blob, index): GitListedRun[] => { + const benchmarkPath = benchmarkPaths[index]; + const benchmark = JSON.parse(blob.content.toString('utf8')) as GitRunBenchmark; + const runDir = path.posix.dirname(benchmarkPath); + const relativeRunPath = path.posix.relative('runs', runDir); + const runId = buildGitRunId(relativeRunPath); + const timestamp = benchmark.metadata?.timestamp?.trim() || path.posix.basename(runDir); + const targets = benchmark.metadata?.targets ?? []; + const passRate = computeAveragePassRate(benchmark.run_summary); + + return [ + { + run_id: runId, + experiment: getRunExperiment(runId, benchmark), + timestamp, + ...(passRate !== undefined && { pass_rate: passRate }), + ...(targets.length === 1 && targets[0] ? { target: targets[0] } : {}), + manifest_path: path.posix.join(runDir, 'index.jsonl'), + benchmark_path: benchmarkPath, + display_name: path.posix.basename(runDir), + test_count: benchmark.metadata?.tests_run?.length ?? 
0, + avg_score: 0, + size_bytes: blob.size, + }, + ]; + }); + + runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp)); + return runs; +} diff --git a/packages/core/src/evaluation/validation/config-validator.ts b/packages/core/src/evaluation/validation/config-validator.ts index 5196feaf..38968f77 100644 --- a/packages/core/src/evaluation/validation/config-validator.ts +++ b/packages/core/src/evaluation/validation/config-validator.ts @@ -78,22 +78,48 @@ export async function validateConfigFile(filePath: string): Promise<ValidationRe }); } else { const resultsRecord = results as Record<string, unknown>; - if (typeof resultsRecord.repo !== 'string' || resultsRecord.repo.trim().length === 0) { + if (resultsRecord.mode !== 'github') { errors.push({ severity: 'error', filePath, - location: 'results.repo', - message: "Field 'results.repo' must be a non-empty string", + location: 'results.mode', + message: "Field 'results.mode' must be 'github'", }); } - if (typeof resultsRecord.path !== 'string' || resultsRecord.path.trim().length === 0) { + if (typeof resultsRecord.repo !== 'string' || resultsRecord.repo.trim().length === 0) { errors.push({ severity: 'error', filePath, - location: 'results.path', - message: "Field 'results.path' must be a non-empty string", + location: 'results.repo', + message: "Field 'results.repo' must be a non-empty string", }); } + if (resultsRecord.path !== undefined) { + if (typeof resultsRecord.path !== 'string' || resultsRecord.path.trim().length === 0) { + errors.push({ + severity: 'error', + filePath, + location: 'results.path', + message: "Field 'results.path' must be a non-empty string", + }); + } else { + const p = resultsRecord.path.trim(); + const isFilesystemPath = + p.startsWith('/') || + p.startsWith('~/') || + p.startsWith('~\\') || + p === '~' || + /^[A-Za-z]:[/\\]/.test(p); + if (!isFilesystemPath) { + errors.push({ + severity: 'error', + filePath, + location: 'results.path', + message: `'results.path' must be an absolute or 
home-relative filesystem path (e.g., ~/data/agentv-results). Found: '${p}'. Remove 'path' to use the default.`, + }); + } + } + } if (resultsRecord.auto_push !== undefined && typeof resultsRecord.auto_push !== 'boolean') { errors.push({ severity: 'error', diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index aab188c8..aa43c2a9 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -61,7 +61,7 @@ export { toSnakeCaseDeep, toCamelCaseDeep } from './evaluation/case-conversion.j export { ensureResultsRepoClone, syncResultsRepo, - getResultsRepoCachePaths, + getResultsRepoLocalPaths, getResultsRepoStatus, normalizeResultsConfig, resolveResultsRepoRunsDir, @@ -74,9 +74,11 @@ export { pushResultsRepoBranch, createDraftResultsPr, directPushResults, + listGitRuns, type CheckedOutResultsRepoBranch, + type GitListedRun, type PreparedResultsRepoBranch, - type ResultsRepoCachePaths, + type ResultsRepoLocalPaths, type ResultsRepoStatus, } from './evaluation/results-repo.js'; export { diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts index b8d32524..6918f56e 100644 --- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts +++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts @@ -9,245 +9,295 @@ import { describe, expect, it } from 'bun:test'; import path from 'node:path'; import { evaluate } from '../../src/evaluation/evaluate.js'; +const PROGRAMMATIC_API_TIMEOUT_MS = 15_000; + describe('evaluate() — programmatic API extensions', () => { // --------------------------------------------------------------------------- // budgetUsd // --------------------------------------------------------------------------- - it('accepts budgetUsd and passes it to the orchestrator', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'budget-test', - input: 'hello', - assert: [{ type: 'contains', value: 'hello' }], - 
}, - ], - target: { name: 'default', provider: 'mock', response: 'hello world' }, - budgetUsd: 10.0, - }); - expect(summary.passed).toBe(1); - }); + it( + 'accepts budgetUsd and passes it to the orchestrator', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'budget-test', + input: 'hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + budgetUsd: 10.0, + }); + expect(summary.passed).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); // --------------------------------------------------------------------------- // turns + mode: 'conversation' // --------------------------------------------------------------------------- - it('accepts turns with explicit conversation mode', async () => { - const { summary, results } = await evaluate({ - tests: [ - { - id: 'conversation-explicit', - mode: 'conversation', - turns: [ - { - input: 'Hello', - assert: [{ type: 'contains', value: 'mock' }], - }, - { - input: 'How are you?', - assert: [{ type: 'contains', value: 'mock' }], - }, - ], - }, - ], - target: { name: 'default', provider: 'mock', response: 'mock response' }, - }); - expect(summary.total).toBe(1); - expect(results.length).toBe(1); - }); - - it('infers conversation mode when turns[] is provided without explicit mode', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'conversation-inferred', - turns: [ - { - input: 'First turn', - assert: [{ type: 'contains', value: 'mock' }], - }, - ], - }, - ], - target: { name: 'default', provider: 'mock', response: 'mock response' }, - }); - expect(summary.total).toBe(1); - }); - - it('supports expectedOutput on individual turns', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'turn-expected-output', - turns: [ - { - input: 'Say hello', - expectedOutput: 'Hello!', - assert: [{ type: 'contains', value: 'mock' }], - }, - ], - }, - ], - target: { name: 'default', 
provider: 'mock', response: 'mock response' }, - }); - expect(summary.total).toBe(1); - }); - - it('supports message array input in turns', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'turn-message-array', - turns: [ - { - input: [ - { role: 'system', content: 'You are helpful' }, - { role: 'user', content: 'Hello' }, - ], - assert: [{ type: 'contains', value: 'mock' }], - }, - ], - }, - ], - target: { name: 'default', provider: 'mock', response: 'mock response' }, - }); - expect(summary.total).toBe(1); - }); + it( + 'accepts turns with explicit conversation mode', + async () => { + const { summary, results } = await evaluate({ + tests: [ + { + id: 'conversation-explicit', + mode: 'conversation', + turns: [ + { + input: 'Hello', + assert: [{ type: 'contains', value: 'mock' }], + }, + { + input: 'How are you?', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + expect(results.length).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); + + it( + 'infers conversation mode when turns[] is provided without explicit mode', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'conversation-inferred', + turns: [ + { + input: 'First turn', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); + + it( + 'supports expectedOutput on individual turns', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'turn-expected-output', + turns: [ + { + input: 'Say hello', + expectedOutput: 'Hello!', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }, + 
PROGRAMMATIC_API_TIMEOUT_MS, + ); + + it( + 'supports message array input in turns', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'turn-message-array', + turns: [ + { + input: [ + { role: 'system', content: 'You are helpful' }, + { role: 'user', content: 'Hello' }, + ], + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); // --------------------------------------------------------------------------- // aggregation // --------------------------------------------------------------------------- - it('accepts aggregation on conversation tests', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'aggregation-min', - turns: [ - { - input: 'Turn 1', - assert: [{ type: 'contains', value: 'mock' }], - }, - { - input: 'Turn 2', - assert: [{ type: 'contains', value: 'mock' }], - }, - ], - aggregation: 'min', - }, - ], - target: { name: 'default', provider: 'mock', response: 'mock response' }, - }); - expect(summary.total).toBe(1); - }); + it( + 'accepts aggregation on conversation tests', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'aggregation-min', + turns: [ + { + input: 'Turn 1', + assert: [{ type: 'contains', value: 'mock' }], + }, + { + input: 'Turn 2', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + aggregation: 'min', + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); // --------------------------------------------------------------------------- // beforeAll // --------------------------------------------------------------------------- - it('accepts beforeAll as a string', async () => { - // beforeAll requires a workspace to execute in; without repos it just attaches - // the hook config. 
This test verifies the type is accepted without throwing. - const { summary } = await evaluate({ - tests: [ - { - id: 'before-all-string', - input: 'hello', - assert: [{ type: 'contains', value: 'test' }], - }, - ], - target: { name: 'default', provider: 'mock', response: 'test output' }, - beforeAll: 'echo "setup complete"', - }); - expect(summary.total).toBe(1); - }); - - it('accepts beforeAll as a string array', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'before-all-array', - input: 'hello', - assert: [{ type: 'contains', value: 'test' }], - }, - ], - target: { name: 'default', provider: 'mock', response: 'test output' }, - beforeAll: ['echo', 'setup complete'], - }); - expect(summary.total).toBe(1); - }); + it( + 'accepts beforeAll as a string', + async () => { + // beforeAll requires a workspace to execute in; without repos it just attaches + // the hook config. This test verifies the type is accepted without throwing. + const { summary } = await evaluate({ + tests: [ + { + id: 'before-all-string', + input: 'hello', + assert: [{ type: 'contains', value: 'test' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'test output' }, + beforeAll: 'echo "setup complete"', + }); + expect(summary.total).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); + + it( + 'accepts beforeAll as a string array', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'before-all-array', + input: 'hello', + assert: [{ type: 'contains', value: 'test' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'test output' }, + beforeAll: ['echo', 'setup complete'], + }); + expect(summary.total).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); // --------------------------------------------------------------------------- // Combined usage // --------------------------------------------------------------------------- - it('supports all new fields together', async () => { - const { summary } = await 
evaluate({ - tests: [ - { - id: 'combined-test', - turns: [ - { - input: 'Hello', - expectedOutput: 'Hi there', - assert: [{ type: 'contains', value: 'mock' }], - }, - { - input: 'Goodbye', - assert: [{ type: 'contains', value: 'mock' }], - }, - ], - aggregation: 'mean', - }, - ], - target: { name: 'default', provider: 'mock', response: 'mock response' }, - budgetUsd: 5.0, - beforeAll: 'echo "setup"', - }); - expect(summary.total).toBe(1); - }); + it( + 'supports all new fields together', + async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'combined-test', + turns: [ + { + input: 'Hello', + expectedOutput: 'Hi there', + assert: [{ type: 'contains', value: 'mock' }], + }, + { + input: 'Goodbye', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + aggregation: 'mean', + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + budgetUsd: 5.0, + beforeAll: 'echo "setup"', + }); + expect(summary.total).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); // --------------------------------------------------------------------------- // Backwards compatibility: input still works as before // --------------------------------------------------------------------------- - it('still works with standard single-turn input', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'standard-input', - input: 'hello', - assert: [{ type: 'contains', value: 'hello' }], - }, - ], - target: { name: 'default', provider: 'mock', response: 'hello world' }, - }); - expect(summary.passed).toBe(1); - }); - - it('uses inline target from a TypeScript specFile', async () => { - const specFile = path.join(import.meta.dir, 'loaders', 'fixtures', 'default-export.eval.ts'); - - const { summary } = await evaluate({ - specFile, - }); - - expect(summary.total).toBe(1); - expect(summary.passed).toBe(1); - }); + it( + 'still works with standard single-turn input', + async () => { + const { summary } = await evaluate({ + tests: [ 
+ { + id: 'standard-input', + input: 'hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + }); + expect(summary.passed).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); + + it( + 'uses inline target from a TypeScript specFile', + async () => { + const specFile = path.join(import.meta.dir, 'loaders', 'fixtures', 'default-export.eval.ts'); + + const { summary } = await evaluate({ + specFile, + }); + + expect(summary.total).toBe(1); + expect(summary.passed).toBe(1); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); // --------------------------------------------------------------------------- // Validation // --------------------------------------------------------------------------- - it('throws when input is missing on a non-conversation test', async () => { - expect(() => - evaluate({ - // biome-ignore lint/suspicious/noExplicitAny: intentionally testing invalid input - tests: [{ id: 'no-input', assert: [{ type: 'contains', value: 'x' }] } as any], - target: { name: 'default', provider: 'mock', response: 'hello' }, - }), - ).toThrow("Test 'no-input': input is required for non-conversation tests"); - }); + it( + 'throws when input is missing on a non-conversation test', + async () => { + expect(() => + evaluate({ + // biome-ignore lint/suspicious/noExplicitAny: intentionally testing invalid input + tests: [{ id: 'no-input', assert: [{ type: 'contains', value: 'x' }] } as any], + target: { name: 'default', provider: 'mock', response: 'hello' }, + }), + ).toThrow("Test 'no-input': input is required for non-conversation tests"); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); }); diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts index 3846b471..e97b03a4 100644 --- a/packages/core/test/evaluation/loaders/config-loader.test.ts +++ b/packages/core/test/evaluation/loaders/config-loader.test.ts @@ -137,11 +137,12 @@ 
describe('extractTrialsConfig', () => { }); describe('parseResultsConfig', () => { - it('parses valid results config', () => { + it('parses valid results config with explicit path', () => { const result = parseResultsConfig( { + mode: 'github', repo: 'EntityProcess/agentv-evals', - path: 'autopilot-dev/runs', + path: '~/data/agentv-results', auto_push: true, branch_prefix: 'eval-results', }, @@ -149,18 +150,83 @@ describe('parseResultsConfig', () => { ); expect(result).toEqual({ + mode: 'github', repo: 'EntityProcess/agentv-evals', - path: 'autopilot-dev/runs', + path: '~/data/agentv-results', auto_push: true, branch_prefix: 'eval-results', }); }); + it('parses valid results config without path (defaults omitted)', () => { + const result = parseResultsConfig( + { + mode: 'github', + repo: 'EntityProcess/agentv-evals', + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result).toEqual({ + mode: 'github', + repo: 'EntityProcess/agentv-evals', + }); + }); + + it('returns undefined when mode is missing', () => { + const result = parseResultsConfig( + { + repo: 'EntityProcess/agentv-evals', + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result).toBeUndefined(); + }); + + it('returns undefined when mode is not github', () => { + const result = parseResultsConfig( + { + mode: 'other', + repo: 'EntityProcess/agentv-evals', + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result).toBeUndefined(); + }); + + it('returns undefined when path looks like a repo subdirectory', () => { + const result = parseResultsConfig( + { + mode: 'github', + repo: 'EntityProcess/agentv-evals', + path: 'autopilot-dev/runs', + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result).toBeUndefined(); + }); + + it('accepts absolute path', () => { + const result = parseResultsConfig( + { + mode: 'github', + repo: 'EntityProcess/agentv-evals', + path: '/home/user/data/results', + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result?.path).toBe('/home/user/data/results'); + }); + it('returns 
undefined when repo is empty', () => { const result = parseResultsConfig( { + mode: 'github', repo: '', - path: 'autopilot-dev/runs', }, '/tmp/.agentv/config.yaml', ); @@ -171,8 +237,8 @@ describe('parseResultsConfig', () => { it('returns undefined when repo is not a string', () => { const result = parseResultsConfig( { + mode: 'github', repo: 123, - path: 'autopilot-dev/runs', }, '/tmp/.agentv/config.yaml', ); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index daac1ee1..d4cc49e9 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -3082,9 +3082,13 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id responses: [{ output: [{ role: 'assistant', content: [{ type: 'text', text: 'answer' }] }] }], }); - // Use YAML workspace.path (not CLI --workspace) with type: git repos. - // repo-a exists → should be reused. repo-b is missing but uses a fake URL → should fail clone. - // Since repo-a is reused (skipped) and repo-b clone fails, this proves per-repo logic works. + const missingRepoBSource = path.join(testDir, 'missing-repo-b-source'); + + // Use YAML workspace.path (not CLI --workspace) with mixed repo states. + // repo-a exists → should be reused. repo-b is missing and points to a missing local source + // → should fail immediately. Since repo-a is reused (skipped) and repo-b materialization + // fails fast, this proves the per-repo existence check works without depending on network + // timeouts from cloning fake remotes. 
const evalCase: EvalTest = { ...baseTestCase, workspace: { @@ -3098,15 +3102,14 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id }, { path: 'repo-b', - source: { type: 'git', url: 'https://github.com/example/repo-b.git' }, - checkout: { ref: 'main' }, + source: { type: 'local', path: missingRepoBSource }, }, ], }, }; - // repo-b clone will fail (fake URL), which proves repo-a was skipped (per-repo check) - // and only repo-b was attempted + // repo-b materialization fails immediately, which proves repo-a was skipped + // and only repo-b was attempted. await expect( runEvaluation({ testFilePath: 'in-memory.yaml', @@ -3117,7 +3120,7 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id evalCases: [evalCase], keepWorkspaces: true, }), - ).rejects.toThrow('Failed to materialize repos'); + ).rejects.toThrow('Local repo path validation failed'); // repo-a marker should still exist (not deleted by static workspace cleanup) await fsAccess(path.join(repoADir, 'marker.txt')); diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts new file mode 100644 index 00000000..211f2e98 --- /dev/null +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -0,0 +1,294 @@ +import { execSync } from 'node:child_process'; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; + +import type { ResultsConfig } from '../../src/evaluation/loaders/config-loader.js'; +import { + directPushResults, + ensureResultsRepoClone, + listGitRuns, + syncResultsRepo, +} from '../../src/evaluation/results-repo.js'; + +function cleanGitEnv(): Record<string, string> { + const env: Record<string, string> = {}; + for (const [key, value] of Object.entries(process.env)) { + if (value !== undefined && !(key.startsWith('GIT_') && key !== 
'GIT_SSH_COMMAND')) { + env[key] = value; + } + } + return env; +} + +function git(cmd: string, cwd: string): string { + return execSync(cmd, { + cwd, + env: cleanGitEnv(), + stdio: ['ignore', 'pipe', 'pipe'], + }) + .toString() + .trim(); +} + +function createResultsConfig(repoDir: string, cloneDir: string): ResultsConfig { + return { + mode: 'github', + repo: `file://${repoDir}`, + path: cloneDir, + auto_push: true, + }; +} + +function initializeRemoteRepo(rootDir: string): { remoteDir: string; seedDir: string } { + const remoteDir = path.join(rootDir, 'results-remote.git'); + git(`git init --bare --initial-branch=main --quiet "${remoteDir}"`, rootDir); + + const seedDir = path.join(rootDir, 'results-seed'); + git(`git clone --quiet "${remoteDir}" "${seedDir}"`, rootDir); + git('git config user.email "test@example.com"', seedDir); + git('git config user.name "Test User"', seedDir); + writeFileSync(path.join(seedDir, 'README.md'), '# results repo\n'); + git('git add README.md && git commit --quiet -m "seed repo"', seedDir); + git('git push --quiet origin main', seedDir); + + return { remoteDir, seedDir }; +} + +function writeRunArtifacts(runDir: string, experiment: string, timestamp: string): void { + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"alpha"}\n'); + writeFileSync( + path.join(runDir, 'benchmark.json'), + JSON.stringify( + { + metadata: { + timestamp, + experiment, + targets: ['gpt-4o'], + tests_run: ['alpha'], + }, + run_summary: { + 'gpt-4o': { + pass_rate: { mean: 1 }, + }, + }, + }, + null, + 2, + ), + ); +} + +describe('listGitRuns', () => { + let repoDir: string; + + beforeEach(() => { + repoDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-results-repo-test-')); + git('git init', repoDir); + git('git config user.email "test@example.com"', repoDir); + git('git config user.name "Test User"', repoDir); + }); + + afterEach(() => { + rmSync(repoDir, { recursive: true, force: true }); + }); + + 
it('returns committed runs derived from benchmark.json blobs', async () => { + const defaultRunDir = path.join(repoDir, 'runs', 'default', '2026-05-20T10-00-00-000Z'); + mkdirSync(defaultRunDir, { recursive: true }); + writeFileSync( + path.join(defaultRunDir, 'benchmark.json'), + JSON.stringify( + { + metadata: { + timestamp: '2026-05-20T10:00:00.000Z', + targets: ['gpt-4o'], + tests_run: ['alpha', 'beta'], + }, + run_summary: { + 'gpt-4o': { + pass_rate: { mean: 0.5 }, + }, + }, + }, + null, + 2, + ), + ); + + const experimentRunDir = path.join(repoDir, 'runs', 'with-skills', '2026-05-21T11-00-00-000Z'); + mkdirSync(experimentRunDir, { recursive: true }); + writeFileSync( + path.join(experimentRunDir, 'benchmark.json'), + JSON.stringify( + { + metadata: { + timestamp: '2026-05-21T11:00:00.000Z', + experiment: 'with-skills', + targets: ['claude-sonnet', 'gpt-4o'], + tests_run: ['alpha', 'beta', 'gamma'], + }, + run_summary: { + 'claude-sonnet': { + pass_rate: { mean: 1 }, + }, + 'gpt-4o': { + pass_rate: { mean: 0.5 }, + }, + }, + }, + null, + 2, + ), + ); + + git('git add runs && git commit -m "seed runs"', repoDir); + + const runs = await listGitRuns(repoDir, 'HEAD'); + + expect(runs).toHaveLength(2); + expect(runs.map((run) => run.run_id)).toEqual([ + 'with-skills::2026-05-21T11-00-00-000Z', + '2026-05-20T10-00-00-000Z', + ]); + expect(runs[0]).toMatchObject({ + experiment: 'with-skills', + timestamp: '2026-05-21T11:00:00.000Z', + display_name: '2026-05-21T11-00-00-000Z', + manifest_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/index.jsonl', + benchmark_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/benchmark.json', + test_count: 3, + pass_rate: 0.75, + avg_score: 0, + }); + expect(runs[0].target).toBeUndefined(); + expect(runs[1]).toMatchObject({ + experiment: 'default', + target: 'gpt-4o', + manifest_path: 'runs/default/2026-05-20T10-00-00-000Z/index.jsonl', + test_count: 2, + pass_rate: 0.5, + }); + expect(runs[0].size_bytes).toBeGreaterThan(0); + }); + 
+ + it('returns an empty list when the ref has no committed runs', async () => { + writeFileSync(path.join(repoDir, 'README.md'), '# test\n'); + git('git add README.md && git commit -m "initial"', repoDir); + + await expect(listGitRuns(repoDir, 'HEAD')).resolves.toEqual([]); + }); + + it('ignores inherited git hook environment variables', async () => { + const runDir = path.join(repoDir, 'runs', 'default', '2026-05-20T10-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + writeFileSync( + path.join(runDir, 'benchmark.json'), + JSON.stringify( + { + metadata: { + timestamp: '2026-05-20T10:00:00.000Z', + targets: ['gpt-4o'], + tests_run: ['alpha'], + }, + run_summary: { + 'gpt-4o': { + pass_rate: { mean: 1 }, + }, + }, + }, + null, + 2, + ), + ); + git('git add runs && git commit -m "seed run"', repoDir); + + const previousGitDir = process.env.GIT_DIR; + const previousGitWorkTree = process.env.GIT_WORK_TREE; + process.env.GIT_DIR = '/tmp/not-the-test-repo'; + process.env.GIT_WORK_TREE = '/tmp/not-the-test-repo'; + + try { + const runs = await listGitRuns(repoDir, 'HEAD'); + expect(runs).toHaveLength(1); + expect(runs[0].run_id).toBe('2026-05-20T10-00-00-000Z'); + } finally { + if (previousGitDir === undefined) { + Reflect.deleteProperty(process.env, 'GIT_DIR'); // remove the key: assigning undefined can leave the string 'undefined' behind + } else { + process.env.GIT_DIR = previousGitDir; + } + + if (previousGitWorkTree === undefined) { + Reflect.deleteProperty(process.env, 'GIT_WORK_TREE'); // same restore rule as GIT_DIR so later git calls see a clean env + } else { + process.env.GIT_WORK_TREE = previousGitWorkTree; + } + } + }); +}); + +describe('results repo write path', () => { + let rootDir: string; + + beforeEach(() => { + rootDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-results-repo-write-test-')); + }); + + afterEach(() => { + rmSync(rootDir, { recursive: true, force: true }); + }); + + it('commits pushed runs into the configured clone with an Agentv-Run trailer', async () => { + const { remoteDir } = initializeRemoteRepo(rootDir); + const cloneDir = path.join(rootDir, 'results-clone'); + const sourceDir =
path.join(rootDir, 'source-run'); + const runTimestamp = '2026-05-22T10-00-00-000Z'; + const destinationPath = path.join('with-skills', runTimestamp); + writeRunArtifacts(sourceDir, 'with-skills', '2026-05-22T10:00:00.000Z'); + + const pushed = await directPushResults({ + config: createResultsConfig(remoteDir, cloneDir), + sourceDir, + destinationPath, + commitMessage: 'feat(results): with-skills - 1/1 PASS (1.000)', + }); + + expect(pushed).toBe(true); + expect(git('git rev-parse --show-toplevel', cloneDir)).toBe(cloneDir); + expect(git('git log -1 --pretty=%B', cloneDir)).toContain( + `Agentv-Run: with-skills::${runTimestamp}`, + ); + expect(git(`git --git-dir "${remoteDir}" log -1 --pretty=%B main`, rootDir)).toContain( + `Agentv-Run: with-skills::${runTimestamp}`, + ); + + const runs = await listGitRuns(cloneDir, 'HEAD'); + expect(runs).toHaveLength(1); + expect(runs[0].run_id).toBe(`with-skills::${runTimestamp}`); + }, 20000); + + it('syncResultsRepo refreshes refs without checking out the base branch', async () => { + const { remoteDir, seedDir } = initializeRemoteRepo(rootDir); + const cloneDir = path.join(rootDir, 'results-clone'); + const config = createResultsConfig(remoteDir, cloneDir); + + await ensureResultsRepoClone(config); + git('git config user.email "test@example.com"', cloneDir); + git('git config user.name "Test User"', cloneDir); + git('git checkout -b scratch', cloneDir); + + writeFileSync(path.join(seedDir, 'CHANGELOG.md'), 'remote update\n'); + git('git add CHANGELOG.md && git commit --quiet -m "remote update"', seedDir); + git('git push --quiet origin main', seedDir); + const remoteMain = git(`git --git-dir "${remoteDir}" rev-parse main`, rootDir); + + await syncResultsRepo(config); + + expect(git('git branch --show-current', cloneDir)).toBe('scratch'); + expect(git('git rev-parse origin/main', cloneDir)).toBe(remoteMain); + }, 20000); +}); diff --git a/packages/core/test/evaluation/validation/config-validator.test.ts 
b/packages/core/test/evaluation/validation/config-validator.test.ts index f2adaeef..7aa41b91 100644 --- a/packages/core/test/evaluation/validation/config-validator.test.ts +++ b/packages/core/test/evaluation/validation/config-validator.test.ts @@ -51,8 +51,8 @@ describe('validateConfigFile', () => { await writeFile( filePath, `results: + mode: github repo: EntityProcess/agentv-evals - path: autopilot-dev/runs auto_push: true branch_prefix: eval-results `, @@ -64,6 +64,42 @@ describe('validateConfigFile', () => { expect(result.errors).toHaveLength(0); }); + it('errors on missing results.mode', async () => { + const filePath = path.join(tempDir, 'config-results-no-mode.yaml'); + await writeFile( + filePath, + `results: + repo: EntityProcess/agentv-evals +`, + ); + + const result = await validateConfigFile(filePath); + + const fieldErrors = result.errors.filter( + (e) => e.severity === 'error' && e.location === 'results.mode', + ); + expect(fieldErrors).toHaveLength(1); + }); + + it('errors on old-style subdirectory path', async () => { + const filePath = path.join(tempDir, 'config-results-old-path.yaml'); + await writeFile( + filePath, + `results: + mode: github + repo: EntityProcess/agentv-evals + path: autopilot-dev/runs +`, + ); + + const result = await validateConfigFile(filePath); + + const fieldErrors = result.errors.filter( + (e) => e.severity === 'error' && e.location === 'results.path', + ); + expect(fieldErrors).toHaveLength(1); + }); + it('errors on invalid required_version type', async () => { const filePath = path.join(tempDir, 'config-bad-version.yaml'); await writeFile(filePath, 'required_version: 3\n');