Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 40 additions & 17 deletions apps/cli/src/commands/results/remote.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import {
directPushResults,
directorySizeBytes,
getResultsRepoStatus,
listGitRuns,
loadConfig,
normalizeResultsConfig,
resolveResultsRepoRunsDir,
syncResultsRepo,
} from '@agentv/core';
Expand Down Expand Up @@ -59,15 +61,6 @@ function getStatusMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error);
}

/**
 * Builds a results config in which every field is populated.
 *
 * `repo` and `path` are copied through unchanged. `auto_push` becomes a strict
 * boolean that is `true` only when the input is exactly `true`. `branch_prefix`
 * is trimmed and replaced by `'eval-results'` when missing or blank.
 *
 * @param config - Results configuration as loaded by the caller.
 * @returns The same configuration with defaults applied.
 */
function normalizeResultsConfig(config: ResultsConfig): Required<ResultsConfig> {
  const trimmedPrefix = config.branch_prefix?.trim();
  const normalized: Required<ResultsConfig> = {
    repo: config.repo,
    path: config.path,
    // Anything other than a literal `true` (undefined, false, truthy non-booleans) disables auto-push.
    auto_push: config.auto_push === true,
    // Truthiness check on purpose: an empty string after trimming must also fall back.
    branch_prefix: trimmedPrefix ? trimmedPrefix : 'eval-results',
  };
  return normalized;
}

function statusForResult(result: EvaluationResult): 'PASS' | 'FAIL' | 'ERROR' {
if (result.executionStatus === 'execution_error' || result.error) {
return 'ERROR';
Expand Down Expand Up @@ -185,15 +178,45 @@ export async function listMergedResultFiles(
};
}

const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
(meta) =>
({
...meta,
filename: encodeRemoteRunId(meta.filename),
raw_filename: meta.filename,
let remoteRuns: SourcedResultFileMeta[] = [];
if (config.mode === 'github') {
try {
const gitRuns = await listGitRuns(config.path);
remoteRuns = gitRuns.map((r) => ({
filename: encodeRemoteRunId(r.run_id),
raw_filename: r.run_id,
source: 'remote' as const,
}) satisfies SourcedResultFileMeta,
);
path: path.join(config.path, r.manifest_path),
displayName: r.display_name,
timestamp: r.timestamp,
testCount: r.test_count,
passRate: r.pass_rate || 0,
avgScore: r.avg_score || 0,
sizeBytes: r.size_bytes || 0,
}));
} catch (error) {
console.error('git-native listing failed, falling back', error);
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
(meta) =>
({
...meta,
filename: encodeRemoteRunId(meta.filename),
raw_filename: meta.filename,
source: 'remote' as const,
}) satisfies SourcedResultFileMeta,
);
}
} else {
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
(meta) =>
({
...meta,
filename: encodeRemoteRunId(meta.filename),
raw_filename: meta.filename,
source: 'remote' as const,
}) satisfies SourcedResultFileMeta,
);
}

const merged = [...localRuns, ...remoteRuns].sort((a, b) =>
b.timestamp.localeCompare(a.timestamp),
Expand Down
132 changes: 93 additions & 39 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -274,49 +274,103 @@ function inferExperimentFromRunId(runId: string): string | undefined {
return experiment;
}

/** Page size `handleRuns` falls back to when a request sends `cursor` without an explicit `limit`. */
const DEFAULT_RUN_PAGE_LIMIT = 50;

/**
 * Parses the raw `limit` query parameter into a page size.
 *
 * The three-way result lets the caller tell "not supplied" apart from "invalid".
 *
 * @param limitParam - Raw query-string value, or `undefined` when the parameter was not sent.
 * @returns `undefined` when no limit was supplied; the parsed value when it is a
 *   positive safe integer; `null` when the value is malformed (non-digit
 *   characters, empty, zero, or too large to be represented exactly).
 */
function parseRunPageLimit(limitParam: string | undefined): number | undefined | null {
  if (limitParam === undefined) {
    return undefined;
  }
  // Digits only: rejects signs, decimals, exponents, whitespace and the empty string.
  if (!/^\d+$/.test(limitParam)) {
    return null;
  }
  const limit = Number.parseInt(limitParam, 10);
  // A long enough digit string parses to an imprecise value or to Infinity;
  // neither is a positive integer, so treat it as invalid instead of passing it on.
  return Number.isSafeInteger(limit) && limit > 0 ? limit : null;
}

/**
 * Returns one page of `runs`, using a run's `filename` as the cursor.
 *
 * @param runs - Full list of runs, already in the order they should be served.
 * @param cursor - `filename` of the last run on the previous page. A missing or
 *   empty cursor starts from the first run.
 * @param limit - Maximum number of runs on the page. `undefined` disables
 *   pagination and returns every run, whatever the cursor is.
 * @returns The page, plus `nextCursor` (the `filename` of the page's last run)
 *   only when further runs exist after the page. A cursor that matches no run
 *   yields an empty page with no `nextCursor`.
 */
function paginateRuns<T extends { filename: string }>(
  runs: T[],
  cursor: string | undefined,
  limit: number | undefined,
): { runs: T[]; nextCursor?: string } {
  if (limit === undefined) {
    return { runs };
  }

  // Index of the first run on the requested page.
  let start = 0;
  if (cursor) {
    const cursorPosition = runs.findIndex((candidate) => candidate.filename === cursor);
    if (cursorPosition === -1) {
      return { runs: [] };
    }
    start = cursorPosition + 1;
  }

  const end = start + limit;
  const page = runs.slice(start, end);
  const lastOnPage = page[page.length - 1];

  // Emit a cursor only when the page is non-empty and runs remain after it.
  const hasMore = end < runs.length && lastOnPage !== undefined;
  if (!hasMore) {
    return { runs: page };
  }
  return { runs: page, nextCursor: lastOnPage.filename };
}

async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
const { runs: metas } = await listMergedResultFiles(searchDir);
const { threshold: passThreshold } = loadStudioConfig(agentvDir);
return c.json({
runs: metas.map((m) => {
let target: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
let passRate = m.passRate;
try {
const records = loadLightweightResults(m.path);
if (records.length > 0) {
target = records[0].target;
experiment = records[0].experiment ?? experiment;
passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
} else {
// Run is in-progress with 0 results written yet — fall back to the
// in-memory target stored when the Studio launched this run.
target = getActiveRunTarget(m.path);
}
} catch {
// ignore enrichment errors
const parsedLimit = parseRunPageLimit(c.req.query('limit'));
if (parsedLimit === null) {
return c.json({ error: 'limit must be a positive integer' }, 400);
}

const cursor = c.req.query('cursor');
const limit = parsedLimit ?? (cursor ? DEFAULT_RUN_PAGE_LIMIT : undefined);
const runs = metas.map((m) => {
let target: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
let passRate = m.passRate;
try {
const records = loadLightweightResults(m.path);
if (records.length > 0) {
target = records[0].target;
experiment = records[0].experiment ?? experiment;
passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
} else {
// Run is in-progress with 0 results written yet — fall back to the
// in-memory target stored when the Studio launched this run.
target = getActiveRunTarget(m.path);
}
// Surface live status for Studio-launched runs that are still starting
// or running so the RunList can render a spinner instead of the
// pass/fail dot derived from a 0% pass rate.
const liveStatus = getActiveRunStatus(m.path);
const tagsEntry = readRunTags(m.path);
return {
filename: m.filename,
display_name: m.displayName,
path: m.path,
timestamp: m.timestamp,
test_count: m.testCount,
pass_rate: passRate,
avg_score: m.avgScore,
size_bytes: m.sizeBytes,
source: m.source,
...(target && { target }),
...(experiment && { experiment }),
...(tagsEntry && { tags: tagsEntry.tags }),
...(liveStatus && { status: liveStatus }),
};
}),
} catch {
// ignore enrichment errors
}
// Surface live status for Studio-launched runs that are still starting
// or running so the RunList can render a spinner instead of the
// pass/fail dot derived from a 0% pass rate.
const liveStatus = getActiveRunStatus(m.path);
const tagsEntry = readRunTags(m.path);
return {
filename: m.filename,
display_name: m.displayName,
path: m.path,
timestamp: m.timestamp,
test_count: m.testCount,
pass_rate: passRate,
avg_score: m.avgScore,
size_bytes: m.sizeBytes,
source: m.source,
...(target && { target }),
...(experiment && { experiment }),
...(tagsEntry && { tags: tagsEntry.tags }),
...(liveStatus && { status: liveStatus }),
};
});
const page = paginateRuns(runs, cursor, limit);
return c.json({
runs: page.runs,
...(page.nextCursor ? { next_cursor: page.nextCursor } : {}),
});
}

Expand Down
107 changes: 59 additions & 48 deletions apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
Original file line number Diff line number Diff line change
@@ -1,66 +1,77 @@
import { mkdir, readFile, rm, writeFile } from 'node:fs/promises';
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, describe, expect, it } from 'vitest';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';

const FIXTURE_DIR = join(import.meta.dirname, 'fixtures');
const OUT_DIR = join(import.meta.dirname, '__tmp_pipeline_e2e__');
const CLI_ENTRY = join(import.meta.dirname, '../../../../src/cli.ts');
const EVAL_PATH = join(FIXTURE_DIR, 'input-test.eval.yaml');
const PIPELINE_E2E_TIMEOUT_MS = 60_000;

describe('eval pipeline e2e', () => {
let outDir: string;

beforeEach(async () => {
outDir = await mkdtemp(join(tmpdir(), 'agentv-pipeline-e2e-'));
});

afterEach(async () => {
await rm(OUT_DIR, { recursive: true, force: true });
await rm(outDir, { recursive: true, force: true });
});

it('runs full input → grade → bench pipeline', async () => {
const { execa } = await import('execa');
it(
'runs full input → grade → bench pipeline',
async () => {
const { execa } = await import('execa');

// Step 1: pipeline input
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);
const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
expect(manifest.test_ids).toEqual(['test-01']);
// Step 1: pipeline input
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', outDir]);
const manifest = JSON.parse(await readFile(join(outDir, 'manifest.json'), 'utf8'));
expect(manifest.test_ids).toEqual(['test-01']);

// Step 2: Write mock response.md (simulating target execution)
await writeFile(join(OUT_DIR, 'input-test', 'test-01', 'response.md'), 'hello world response');
// Step 2: Write mock response.md (simulating target execution)
await writeFile(join(outDir, 'input-test', 'test-01', 'response.md'), 'hello world response');

// Step 3: pipeline grade
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]);
const gradeResult = JSON.parse(
await readFile(
join(OUT_DIR, 'input-test', 'test-01', 'code_grader_results', 'contains_hello.json'),
'utf8',
),
);
expect(gradeResult.score).toBe(1);
// Step 3: pipeline grade
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', outDir]);
const gradeResult = JSON.parse(
await readFile(
join(outDir, 'input-test', 'test-01', 'code_grader_results', 'contains_hello.json'),
'utf8',
),
);
expect(gradeResult.score).toBe(1);

// Step 4: Write mock LLM grader result to disk, then run pipeline bench
const llmResultsDir = join(OUT_DIR, 'input-test', 'test-01', 'llm_grader_results');
await mkdir(llmResultsDir, { recursive: true });
await writeFile(
join(llmResultsDir, 'relevance.json'),
JSON.stringify({
score: 0.9,
assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
}),
);
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
// Step 4: Write mock LLM grader result to disk, then run pipeline bench
const llmResultsDir = join(outDir, 'input-test', 'test-01', 'llm_grader_results');
await mkdir(llmResultsDir, { recursive: true });
await writeFile(
join(llmResultsDir, 'relevance.json'),
JSON.stringify({
score: 0.9,
assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
}),
);
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', outDir]);

// Verify final artifacts
const grading = JSON.parse(
await readFile(join(OUT_DIR, 'input-test', 'test-01', 'grading.json'), 'utf8'),
);
expect(grading.graders).toHaveLength(2);
expect(grading.summary.pass_rate).toBeGreaterThan(0);
// Verify final artifacts
const grading = JSON.parse(
await readFile(join(outDir, 'input-test', 'test-01', 'grading.json'), 'utf8'),
);
expect(grading.graders).toHaveLength(2);
expect(grading.summary.pass_rate).toBeGreaterThan(0);

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const indexLines = indexContent
.trim()
.split('\n')
.map((line) => JSON.parse(line));
expect(indexLines).toHaveLength(1);
expect(indexLines[0].test_id).toBe('test-01');
const indexContent = await readFile(join(outDir, 'index.jsonl'), 'utf8');
const indexLines = indexContent
.trim()
.split('\n')
.map((line) => JSON.parse(line));
expect(indexLines).toHaveLength(1);
expect(indexLines[0].test_id).toBe('test-01');

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.run_summary).toBeDefined();
}, 30_000);
const benchmark = JSON.parse(await readFile(join(outDir, 'benchmark.json'), 'utf8'));
expect(benchmark.run_summary).toBeDefined();
},
PIPELINE_E2E_TIMEOUT_MS,
);
});
Loading
Loading