diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
new file mode 100644
index 000000000..1ab92bf9e
--- /dev/null
+++ b/apps/cli/src/commands/results/eval-runner.ts
@@ -0,0 +1,439 @@
+/**
+ * Studio eval runner — discovery, launch, and status tracking for eval runs
+ * initiated from the Studio UI.
+ *
+ * Provides Hono route handlers for:
+ *   - GET  /api/eval/discover    — discover eval files in the project
+ *   - GET  /api/eval/targets     — list available target names
+ *   - POST /api/eval/run         — launch an eval run as a child process
+ *   - POST /api/eval/preview     — render the CLI command without launching it
+ *   - GET  /api/eval/status/:id  — poll running eval status
+ *   - GET  /api/eval/runs        — list active and recent Studio-launched runs
+ *
+ * Every route is also mounted under /api/projects/:projectId/eval/... with the
+ * same handler; the caller-supplied `getCwd` decides which directory a request
+ * operates in. Runs live in one shared in-memory map regardless of prefix.
+ *
+ * The module spawns `bun <cli entry> eval ...` and tracks process state in
+ * memory.
+ */
+
+import { type ChildProcess, spawn } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { listTargetNames, readTargetDefinitions } from '@agentv/core';
+import type { Context } from 'hono';
+import type { Hono } from 'hono';
+
+import { TARGET_FILE_CANDIDATES, discoverTargetsFile } from '../../utils/targets.js';
+import { discoverEvalFiles } from '../eval/discover.js';
+import { findRepoRoot } from '../eval/shared.js';
+
+// ── In-memory run tracker ────────────────────────────────────────────────
+
+interface StudioRun {
+  id: string;
+  status: 'starting' | 'running' | 'finished' | 'failed';
+  command: string;
+  startedAt: string;
+  finishedAt?: string;
+  exitCode?: number | null;
+  stdout: string;
+  stderr: string;
+  process?: ChildProcess;
+}
+
+/** How many finished/failed runs stay queryable before the oldest are dropped. */
+const MAX_FINISHED_RUNS = 20;
+/** Captured output is trimmed once it exceeds this many UTF-16 code units. */
+const OUTPUT_CAP = 100_000;
+/** Length of the tail kept when captured output is trimmed. */
+const OUTPUT_KEEP = 80_000;
+
+const activeRuns = new Map<string, StudioRun>();
+
+/** Builds an id of the form `studio-YYYYMMDD-HHMMSS-xxxx` from local time plus a random suffix. */
+function generateRunId(): string {
+  const now = new Date();
+  const pad = (n: number, w = 2) => String(n).padStart(w, '0');
+  const ts = `${now.getFullYear()}${pad(now.getMonth() + 1)}${pad(now.getDate())}-${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`;
+  // toString(36) may produce fewer than four fractional digits, so pad to a fixed width.
+  const rand = Math.random().toString(36).slice(2, 6).padEnd(4, '0');
+  return `studio-${ts}-${rand}`;
+}
+
+/** Drops all but the most recently finished runs so the map cannot grow without bound. */
+function pruneFinishedRuns(): void {
+  const finished = [...activeRuns.entries()]
+    .filter(([, r]) => r.status === 'finished' || r.status === 'failed')
+    .sort((a, b) => (b[1].finishedAt ?? '').localeCompare(a[1].finishedAt ?? ''));
+  for (const [id] of finished.slice(MAX_FINISHED_RUNS)) {
+    activeRuns.delete(id);
+  }
+}
+
+/** Appends `chunk` to `buffer`, keeping only the newest tail once the cap is exceeded. */
+function appendCapped(buffer: string, chunk: string): string {
+  const next = buffer + chunk;
+  return next.length > OUTPUT_CAP ? next.slice(-OUTPUT_KEEP) : next;
+}
+
+/** Extracts a printable message from a caught value of unknown type. */
+function errorMessage(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
+/** The fields of a run that the status and list responses have in common. */
+function summarizeRun(run: StudioRun) {
+  return {
+    id: run.id,
+    status: run.status,
+    command: run.command,
+    started_at: run.startedAt,
+    finished_at: run.finishedAt ?? null,
+    exit_code: run.exitCode ?? null,
+  };
+}
+
+// ── Discover targets file from project root ──────────────────────────────
+
+/** Returns the first existing targets-file candidate inside `dir`, if any. */
+function findTargetsFile(dir: string): string | undefined {
+  for (const candidate of TARGET_FILE_CANDIDATES) {
+    const fullPath = path.join(dir, candidate);
+    if (existsSync(fullPath)) return fullPath;
+  }
+  return undefined;
+}
+
+/**
+ * Lists target names from the targets file found in `cwd` or, failing that,
+ * in the repository root; empty when no file exists or it cannot be read.
+ * NOTE(review): `string[]` is assumed from the Studio client's response type.
+ */
+async function discoverTargetsInProject(cwd: string): Promise<string[]> {
+  const repoRoot = (await findRepoRoot(cwd)) ?? cwd;
+  const targetsFilePath = findTargetsFile(cwd) ?? findTargetsFile(repoRoot);
+  if (!targetsFilePath) return [];
+
+  try {
+    const definitions = await readTargetDefinitions(targetsFilePath);
+    return listTargetNames(definitions);
+  } catch {
+    // Best effort: a broken targets file only means there is nothing to offer in the UI.
+    return [];
+  }
+}
+
+// ── Build CLI command from request body ──────────────────────────────────
+
+interface RunEvalRequest {
+  suite_filter?: string;
+  test_ids?: string[];
+  target?: string;
+  threshold?: number;
+  workers?: number;
+  dry_run?: boolean;
+}
+
+type ParsedRequest = { ok: true; request: RunEvalRequest } | { ok: false; error: string };
+
+const invalid = (error: string): ParsedRequest => ({ ok: false, error });
+const isAbsent = (value: unknown): value is null | undefined =>
+  value === undefined || value === null;
+const isStringArray = (value: unknown): value is string[] =>
+  Array.isArray(value) && value.every((item) => typeof item === 'string');
+// A value beginning with "-" would be parsed by the CLI as an option rather than as data.
+const looksLikeOption = (value: string): boolean => value.trim().startsWith('-');
+
+/**
+ * Validates an untrusted JSON body and normalizes it into a RunEvalRequest.
+ * `null` is treated like an omitted field. Nothing here is shell-interpreted
+ * (the CLI is spawned without a shell), but every value becomes an argv entry,
+ * so types are checked and option-looking strings are rejected.
+ */
+function parseRunEvalRequest(raw: unknown): ParsedRequest {
+  if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) {
+    return invalid('Request body must be a JSON object');
+  }
+  const {
+    suite_filter: suiteFilter,
+    test_ids: testIds,
+    target,
+    threshold,
+    workers,
+    dry_run: dryRun,
+  } = raw as Record<string, unknown>;
+  const request: RunEvalRequest = {};
+
+  if (!isAbsent(suiteFilter)) {
+    if (typeof suiteFilter !== 'string') return invalid('suite_filter must be a string');
+    if (suiteFilter.split(',').some(looksLikeOption)) {
+      return invalid('suite_filter entries must not start with "-"');
+    }
+    request.suite_filter = suiteFilter;
+  }
+
+  if (!isAbsent(testIds)) {
+    if (!isStringArray(testIds)) return invalid('test_ids must be an array of strings');
+    const ids = testIds.map((id) => id.trim()).filter((id) => id !== '');
+    if (ids.some(looksLikeOption)) return invalid('test_ids entries must not start with "-"');
+    request.test_ids = ids;
+  }
+
+  if (!isAbsent(target)) {
+    if (typeof target !== 'string') return invalid('target must be a string');
+    if (looksLikeOption(target)) return invalid('target must not start with "-"');
+    request.target = target;
+  }
+
+  if (!isAbsent(threshold)) {
+    if (typeof threshold !== 'number' || !Number.isFinite(threshold) || threshold < 0) {
+      return invalid('threshold must be a non-negative finite number');
+    }
+    request.threshold = threshold;
+  }
+
+  if (!isAbsent(workers)) {
+    if (typeof workers !== 'number' || !Number.isInteger(workers) || workers < 1) {
+      return invalid('workers must be a positive integer');
+    }
+    request.workers = workers;
+  }
+
+  if (!isAbsent(dryRun)) {
+    if (typeof dryRun !== 'boolean') return invalid('dry_run must be a boolean');
+    request.dry_run = dryRun;
+  }
+
+  return { ok: true, request };
+}
+
+/** Translates a request into CLI argv: `eval`, then positional paths, then flags. */
+function buildCliArgs(req: RunEvalRequest): string[] {
+  const args: string[] = ['eval'];
+
+  // Suite filter (comma-separated eval paths/globs)
+  for (const part of req.suite_filter?.split(',') ?? []) {
+    const trimmed = part.trim();
+    if (trimmed) args.push(trimmed);
+  }
+
+  // Test ID filters
+  for (const id of req.test_ids ?? []) {
+    const trimmed = id.trim();
+    if (trimmed) args.push('--test-id', trimmed);
+  }
+
+  // Target override
+  const target = req.target?.trim();
+  if (target) args.push('--target', target);
+
+  // Threshold
+  if (!isAbsent(req.threshold)) args.push('--threshold', String(req.threshold));
+
+  // Workers
+  if (!isAbsent(req.workers)) args.push('--workers', String(req.workers));
+
+  // Dry run
+  if (req.dry_run) args.push('--dry-run');
+
+  return args;
+}
+
+/** Renders argv as a display-only command line; the quoting is cosmetic, not shell-safe. */
+function buildCliPreview(args: string[]): string {
+  return `agentv ${args.map((a) => (a.includes(' ') || a.includes('*') ? `"${a}"` : a)).join(' ')}`;
+}
+
+// ── Resolve the bun + cli.ts path ────────────────────────────────────────
+
+/**
+ * Locates the CLI entry point: first inside the project at `cwd` (monorepo dev
+ * checkout), then relative to this module's own location.
+ */
+function resolveCliPath(cwd: string): { bunPath: string; cliPath: string } | undefined {
+  // fileURLToPath (unlike URL#pathname) decodes percent-escapes and yields valid Windows paths.
+  const currentDir =
+    typeof __dirname !== 'undefined' ? __dirname : path.dirname(fileURLToPath(import.meta.url));
+  const candidates = [
+    path.join(cwd, 'apps/cli/src/cli.ts'),
+    path.join(cwd, 'apps/cli/dist/cli.js'),
+    path.resolve(currentDir, '../../../cli.ts'),
+    path.resolve(currentDir, '../../cli.js'),
+  ];
+  const cliPath = candidates.find((candidate) => existsSync(candidate));
+  return cliPath ? { bunPath: 'bun', cliPath } : undefined;
+}
+
+// ── Process tracking ─────────────────────────────────────────────────────
+
+/** Wires a spawned child to its run record: captures output and records how it ended. */
+function trackChild(run: StudioRun, child: ChildProcess): void {
+  run.process = child;
+  run.status = 'running';
+
+  child.stdout?.on('data', (chunk: Buffer) => {
+    run.stdout = appendCapped(run.stdout, chunk.toString());
+  });
+  child.stderr?.on('data', (chunk: Buffer) => {
+    run.stderr = appendCapped(run.stderr, chunk.toString());
+  });
+
+  const settle = (status: 'finished' | 'failed') => {
+    run.status = status;
+    run.finishedAt = new Date().toISOString();
+    run.process = undefined;
+    pruneFinishedRuns();
+  };
+
+  child.on('close', (code) => {
+    run.exitCode = code;
+    settle(code === 0 ? 'finished' : 'failed');
+  });
+  child.on('error', (err) => {
+    run.stderr = appendCapped(run.stderr, `\nProcess error: ${err.message}`);
+    settle('failed');
+  });
+}
+
+// ── Route registration ───────────────────────────────────────────────────
+
+// NOTE(review): the suppression below names an explicit `any`; confirm the alias's generic arguments are intact.
+// biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
+type C = Context;
+
+/** Both mount points; the handlers are identical and only the resolved cwd differs. */
+const ROUTE_PREFIXES = ['/api/eval', '/api/projects/:projectId/eval'] as const;
+
+/** Reads and validates the JSON body of a run/preview request. */
+async function readRunEvalRequest(c: C): Promise<ParsedRequest> {
+  let raw: unknown;
+  try {
+    raw = await c.req.json();
+  } catch {
+    return invalid('Invalid JSON body');
+  }
+  return parseRunEvalRequest(raw);
+}
+
+/**
+ * Registers the eval discovery, launch, preview, status, and list routes on
+ * `app`, once per prefix in ROUTE_PREFIXES.
+ *
+ * @param app - Hono application to attach the routes to.
+ * @param getCwd - Resolves the directory a given request operates in.
+ */
+export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) {
+  // ── Discovery: eval files ──────────────────────────────────────────────
+  const discover = async (c: C) => {
+    const cwd = getCwd(c);
+    try {
+      const files = await discoverEvalFiles(cwd);
+      return c.json({
+        eval_files: files.map((f) => ({
+          path: f.path,
+          relative_path: f.relativePath,
+          category: f.category,
+        })),
+      });
+    } catch (err) {
+      return c.json({ error: errorMessage(err), eval_files: [] }, 500);
+    }
+  };
+
+  // ── Discovery: targets ─────────────────────────────────────────────────
+  const targets = async (c: C) => {
+    const cwd = getCwd(c);
+    try {
+      return c.json({ targets: await discoverTargetsInProject(cwd) });
+    } catch (err) {
+      return c.json({ error: errorMessage(err), targets: [] }, 500);
+    }
+  };
+
+  // ── Launch eval run ────────────────────────────────────────────────────
+  const launch = async (c: C) => {
+    const cwd = getCwd(c);
+
+    const parsed = await readRunEvalRequest(c);
+    if (!parsed.ok) return c.json({ error: parsed.error }, 400);
+    const { request } = parsed;
+
+    // Need something to run: at least one suite path or one test id.
+    if (!request.suite_filter?.trim() && !request.test_ids?.length) {
+      return c.json({ error: 'Provide suite_filter or test_ids' }, 400);
+    }
+
+    const cliPaths = resolveCliPath(cwd);
+    if (!cliPaths) {
+      return c.json({ error: 'Cannot locate agentv CLI entry point' }, 500);
+    }
+
+    const args = buildCliArgs(request);
+    const command = buildCliPreview(args);
+    const run: StudioRun = {
+      id: generateRunId(),
+      status: 'starting',
+      command,
+      startedAt: new Date().toISOString(),
+      stdout: '',
+      stderr: '',
+    };
+    activeRuns.set(run.id, run);
+
+    try {
+      const child = spawn(cliPaths.bunPath, [cliPaths.cliPath, ...args], {
+        cwd,
+        stdio: ['ignore', 'pipe', 'pipe'],
+        env: { ...process.env },
+      });
+      trackChild(run, child);
+    } catch (err) {
+      // spawn() itself threw, as opposed to emitting 'error' on the child later.
+      run.status = 'failed';
+      run.stderr = errorMessage(err);
+      run.finishedAt = new Date().toISOString();
+      pruneFinishedRuns();
+      return c.json({ error: errorMessage(err) }, 500);
+    }
+
+    return c.json({ id: run.id, status: run.status, command }, 202);
+  };
+
+  // ── CLI preview (no process is started) ────────────────────────────────
+  const preview = async (c: C) => {
+    const parsed = await readRunEvalRequest(c);
+    if (!parsed.ok) return c.json({ error: parsed.error }, 400);
+    return c.json({ command: buildCliPreview(buildCliArgs(parsed.request)) });
+  };
+
+  // ── Run status ─────────────────────────────────────────────────────────
+  const status = (c: C) => {
+    const run = activeRuns.get(c.req.param('id') ?? '');
+    if (!run) return c.json({ error: 'Run not found' }, 404);
+    return c.json({
+      ...summarizeRun(run),
+      stdout: run.stdout.slice(-10_000),
+      stderr: run.stderr.slice(-5_000),
+    });
+  };
+
+  // ── List runs (newest first) ───────────────────────────────────────────
+  const list = (c: C) => {
+    const runs = [...activeRuns.values()]
+      .map(summarizeRun)
+      .sort((a, b) => b.started_at.localeCompare(a.started_at));
+    return c.json({ runs });
+  };
+
+  for (const prefix of ROUTE_PREFIXES) {
+    app.get(`${prefix}/discover`, discover);
+    app.get(`${prefix}/targets`, targets);
+    app.post(`${prefix}/run`, launch);
+    app.get(`${prefix}/status/:id`, status);
+    app.get(`${prefix}/runs`, list);
+    app.post(`${prefix}/preview`, preview);
+  }
+}
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 92f2b20d5..736174d6e 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -46,6 +46,7 @@ import { parseJsonlResults } from '../eval/artifact-writer.js'; import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; import { listResultFiles } from '../trace/utils.js'; +import { registerEvalRoutes } from './eval-runner.js'; import { loadLightweightResults, loadManifestResults, @@ -892,6 +893,18 @@ export function createApp( app.get('/api/projects/:projectId/targets', (c) => withProject(c, handleTargets)); app.get('/api/projects/:projectId/feedback', (c) => withProject(c, handleFeedbackRead)); + // ── Eval runner routes (discovery, launch, status) ──────────────────── + + registerEvalRoutes(app, (c) => { + // For project-scoped routes, resolve to project path; 
otherwise use searchDir + const projectId = c.req.param('projectId'); + if (projectId) { + const project = getProject(projectId); + if (project) return project.path; + } + return searchDir; + }); + // ── Static file serving for Studio SPA ──────────────────────────────── const studioDistPath = options?.studioDir ?? resolveStudioDistDir(); diff --git a/apps/studio/src/components/RunEvalModal.tsx b/apps/studio/src/components/RunEvalModal.tsx new file mode 100644 index 000000000..886f53eb2 --- /dev/null +++ b/apps/studio/src/components/RunEvalModal.tsx @@ -0,0 +1,457 @@ +/** + * RunEvalModal — a compact wizard for launching eval runs from Studio. + * + * Two-step flow: + * Step 1: What to run (suite filter, test-id pills, target override) + * Step 2: Advanced options (threshold, workers) — collapsed by default + * + * Shows a CLI preview before launch, then tracks run status. + * + * Entry points pass optional prefill props (e.g., from a run detail page + * or eval detail page) so the modal opens pre-populated. + */ + +import { useQueryClient } from '@tanstack/react-query'; +import { useCallback, useEffect, useMemo, useState } from 'react'; + +import { + launchEvalRun, + previewEvalCommand, + useEvalDiscover, + useEvalRunStatus, + useEvalTargets, +} from '~/lib/api'; +import type { RunEvalRequest } from '~/lib/types'; + +// ── Props ──────────────────────────────────────────────────────────────── + +export interface RunEvalModalProps { + open: boolean; + onClose: () => void; + projectId?: string; + prefill?: { + suiteFilter?: string; + testIds?: string[]; + target?: string; + }; +} + +// ── Component ──────────────────────────────────────────────────────────── + +export function RunEvalModal({ open, onClose, projectId, prefill }: RunEvalModalProps) { + const queryClient = useQueryClient(); + + // Form state + const [suiteFilter, setSuiteFilter] = useState(prefill?.suiteFilter ?? 
''); + const [testIdInput, setTestIdInput] = useState(''); + const [testIds, setTestIds] = useState(prefill?.testIds ?? []); + const [target, setTarget] = useState(prefill?.target ?? ''); + const [threshold, setThreshold] = useState(''); + const [workers, setWorkers] = useState(''); + const [dryRun, setDryRun] = useState(false); + const [showAdvanced, setShowAdvanced] = useState(false); + + // Run state + const [activeRunId, setActiveRunId] = useState(null); + const [error, setError] = useState(null); + const [launching, setLaunching] = useState(false); + const [cliPreview, setCliPreview] = useState(null); + + // Data + const { data: discoverData } = useEvalDiscover(projectId); + const { data: targetsData } = useEvalTargets(projectId); + const { data: runStatus } = useEvalRunStatus(activeRunId); + + const evalFiles = useMemo(() => discoverData?.eval_files ?? [], [discoverData]); + const targetNames = useMemo(() => targetsData?.targets ?? [], [targetsData]); + + // Reset form when opening with new prefill + useEffect(() => { + if (open) { + setSuiteFilter(prefill?.suiteFilter ?? ''); + setTestIds(prefill?.testIds ?? []); + setTarget(prefill?.target ?? 
''); + setTestIdInput(''); + setThreshold(''); + setWorkers(''); + setDryRun(false); + setShowAdvanced(false); + setActiveRunId(null); + setError(null); + setLaunching(false); + setCliPreview(null); + } + }, [open, prefill]); + + // When run finishes, refresh the runs list + useEffect(() => { + if (runStatus?.status === 'finished' || runStatus?.status === 'failed') { + queryClient.invalidateQueries({ queryKey: ['runs'] }); + queryClient.invalidateQueries({ queryKey: ['projects'] }); + } + }, [runStatus?.status, queryClient]); + + // Build request body from form state + const buildRequest = useCallback((): RunEvalRequest => { + const req: RunEvalRequest = {}; + if (suiteFilter.trim()) req.suite_filter = suiteFilter.trim(); + if (testIds.length > 0) req.test_ids = testIds; + if (target) req.target = target; + if (threshold) req.threshold = Number.parseFloat(threshold); + if (workers) req.workers = Number.parseInt(workers, 10); + if (dryRun) req.dry_run = true; + return req; + }, [suiteFilter, testIds, target, threshold, workers, dryRun]); + + // Update CLI preview when form changes + useEffect(() => { + const req = buildRequest(); + if (!req.suite_filter && (!req.test_ids || req.test_ids.length === 0)) { + setCliPreview(null); + return; + } + previewEvalCommand(req, projectId) + .then((r) => setCliPreview(r.command)) + .catch(() => setCliPreview(null)); + }, [buildRequest, projectId]); + + // Add a test ID pill + function addTestId() { + const trimmed = testIdInput.trim(); + if (trimmed && !testIds.includes(trimmed)) { + setTestIds([...testIds, trimmed]); + } + setTestIdInput(''); + } + + function removeTestId(id: string) { + setTestIds(testIds.filter((t) => t !== id)); + } + + // Launch + async function handleLaunch() { + setError(null); + setLaunching(true); + try { + const req = buildRequest(); + const result = await launchEvalRun(req, projectId); + setActiveRunId(result.id); + } catch (err) { + setError((err as Error).message); + } finally { + 
setLaunching(false); + } + } + + if (!open) return null; + + // ── Active run view ──────────────────────────────────────────────────── + + if (activeRunId && runStatus) { + return ( + + + + ); + } + + // ── Form view ────────────────────────────────────────────────────────── + + const canLaunch = !!(suiteFilter.trim() || testIds.length > 0); + + return ( + +
+ {/* Suite filter */} +
+ + setSuiteFilter(e.target.value)} + placeholder="evals/**/*.eval.yaml" + className="w-full rounded-md border border-gray-700 bg-gray-800 px-3 py-2 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" + /> + {evalFiles.length > 0 && !suiteFilter && ( +
+ {evalFiles.slice(0, 5).map((f) => ( + + ))} + {evalFiles.length > 5 && ( + +{evalFiles.length - 5} more + )} +
+ )} +
+ + {/* Test ID filter */} +
+ +
+ setTestIdInput(e.target.value)} + onKeyDown={(e) => { + if (e.key === 'Enter') { + e.preventDefault(); + addTestId(); + } + }} + placeholder="auth-*, retrieval-basic" + className="flex-1 rounded-md border border-gray-700 bg-gray-800 px-3 py-2 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" + /> + +
+ {testIds.length > 0 && ( +
+ {testIds.map((id) => ( + + {id} + + + ))} +
+ )} +
+ + {/* Target override */} +
+ + +
+ + {/* Advanced options */} +
+ + {showAdvanced && ( +
+
+ + setThreshold(e.target.value)} + min="0" + max="1" + step="0.1" + placeholder="0.8" + className="w-full rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" + /> +
+
+ + setWorkers(e.target.value)} + min="1" + max="50" + placeholder="3" + className="w-full rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" + /> +
+
+ +
+
+ )} +
+ + {/* CLI preview */} + {cliPreview && ( +
+
CLI Preview
+ {cliPreview} +
+ )} + + {/* Error */} + {error && ( +
+ {error} +
+ )} + + {/* Actions */} +
+ + +
+
+
+ ); +} + +// ── Sub-components ─────────────────────────────────────────────────────── + +function ModalShell({ + children, + onClose, + title, +}: { + children: React.ReactNode; + onClose: () => void; + title: string; +}) { + return ( +
+
+
+

{title}

+ +
+
{children}
+
+
+ ); +} + +function RunStatusView({ + status, + onClose, +}: { + status: import('~/lib/types').EvalRunStatus; + onClose: () => void; +}) { + const isTerminal = status.status === 'finished' || status.status === 'failed'; + + const statusColors: Record = { + starting: 'text-yellow-400', + running: 'text-cyan-400', + finished: 'text-emerald-400', + failed: 'text-red-400', + }; + + return ( +
+
+ + {status.status === 'running' && '●'}{' '} + {status.status.charAt(0).toUpperCase() + status.status.slice(1)} + + {!isTerminal && ( + + )} +
+ +
+ {status.command} +
+ + {status.stdout && ( +
+
+            {status.stdout.slice(-3000)}
+          
+
+ )} + + {status.stderr && ( +
+
+            {status.stderr.slice(-2000)}
+          
+
+ )} + + {isTerminal && ( +
+ + Exit code: {status.exit_code} + {status.finished_at && ` · ${new Date(status.finished_at).toLocaleTimeString()}`} + + +
+ )} +
+ ); +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index c2c4de029..07f2883b0 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -10,6 +10,11 @@ import { queryOptions, useQuery } from '@tanstack/react-query'; import type { CategoriesResponse, EvalDetailResponse, + EvalDiscoverResponse, + EvalPreviewResponse, + EvalRunResponse, + EvalRunStatus, + EvalTargetsResponse, ExperimentsResponse, FeedbackData, FileContentResponse, @@ -18,6 +23,7 @@ import type { ProjectEntry, ProjectListResponse, RunDetailResponse, + RunEvalRequest, RunListResponse, StudioConfigResponse, SuitesResponse, @@ -396,3 +402,79 @@ export async function saveStudioConfig( } return res.json() as Promise; } + +// ── Eval runner queries & mutations ────────────────────────────────────── + +export function evalDiscoverOptions(projectId?: string) { + const url = projectId ? `${projectApiBase(projectId)}/eval/discover` : '/api/eval/discover'; + return queryOptions({ + queryKey: ['eval-discover', projectId ?? ''], + queryFn: () => fetchJson(url), + staleTime: 30_000, + }); +} + +export function useEvalDiscover(projectId?: string) { + return useQuery(evalDiscoverOptions(projectId)); +} + +export function evalTargetsOptions(projectId?: string) { + const url = projectId ? `${projectApiBase(projectId)}/eval/targets` : '/api/eval/targets'; + return queryOptions({ + queryKey: ['eval-targets', projectId ?? ''], + queryFn: () => fetchJson(url), + staleTime: 30_000, + }); +} + +export function useEvalTargets(projectId?: string) { + return useQuery(evalTargetsOptions(projectId)); +} + +export async function launchEvalRun( + body: RunEvalRequest, + projectId?: string, +): Promise { + const url = projectId ? 
`${projectApiBase(projectId)}/eval/run` : '/api/eval/run'; + const res = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`); + } + return res.json() as Promise; +} + +export function evalRunStatusOptions(runId: string | null) { + return queryOptions({ + queryKey: ['eval-status', runId], + queryFn: () => fetchJson(`/api/eval/status/${runId}`), + enabled: !!runId, + refetchInterval: (query) => { + const status = query.state.data?.status; + if (status === 'finished' || status === 'failed') return false; + return 2_000; + }, + }); +} + +export function useEvalRunStatus(runId: string | null) { + return useQuery(evalRunStatusOptions(runId)); +} + +export async function previewEvalCommand( + body: RunEvalRequest, + projectId?: string, +): Promise { + const url = projectId ? 
`${projectApiBase(projectId)}/eval/preview` : '/api/eval/preview'; + const res = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }); + if (!res.ok) throw new Error(`Preview failed: ${res.status}`); + return res.json() as Promise; +} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index d924bebfd..a1c1c621e 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -195,3 +195,60 @@ export interface ProjectEntry { added_at: string; last_opened_at: string; } + +// ── Eval runner types ──────────────────────────────────────────────────── + +export interface DiscoveredEvalFile { + path: string; + relative_path: string; + category: string; +} + +export interface EvalDiscoverResponse { + eval_files: DiscoveredEvalFile[]; +} + +export interface EvalTargetsResponse { + targets: string[]; +} + +export interface RunEvalRequest { + suite_filter?: string; + test_ids?: string[]; + target?: string; + threshold?: number; + workers?: number; + dry_run?: boolean; +} + +export interface EvalRunResponse { + id: string; + status: string; + command: string; +} + +export interface EvalRunStatus { + id: string; + status: 'starting' | 'running' | 'finished' | 'failed'; + command: string; + started_at: string; + finished_at: string | null; + exit_code: number | null; + stdout: string; + stderr: string; +} + +export interface EvalRunListResponse { + runs: Array<{ + id: string; + status: string; + command: string; + started_at: string; + finished_at: string | null; + exit_code: number | null; + }>; +} + +export interface EvalPreviewResponse { + command: string; +} diff --git a/apps/studio/src/routes/evals/$runId.$evalId.tsx b/apps/studio/src/routes/evals/$runId.$evalId.tsx index d86cd61a9..27fb29e60 100644 --- a/apps/studio/src/routes/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/evals/$runId.$evalId.tsx @@ -7,8 +7,10 @@ */ import { createFileRoute } from 
'@tanstack/react-router'; +import { useState } from 'react'; import { EvalDetail } from '~/components/EvalDetail'; +import { RunEvalModal } from '~/components/RunEvalModal'; import { useRunDetail } from '~/lib/api'; export const Route = createFileRoute('/evals/$runId/$evalId')({ @@ -18,6 +20,7 @@ export const Route = createFileRoute('/evals/$runId/$evalId')({ function EvalDetailPage() { const { runId, evalId } = Route.useParams(); const { data, isLoading, error } = useRunDetail(runId); + const [showRunEval, setShowRunEval] = useState(false); if (isLoading) { return ( @@ -51,13 +54,30 @@ function EvalDetailPage() { return (
-
-

- Run: {runId} / Eval: {evalId} -

-

{evalId}

+
+
+

+ Run: {runId} / Eval: {evalId} +

+

{evalId}

+
+
+ setShowRunEval(false)} + prefill={{ + testIds: [evalId], + target: result.target, + }} + />
); } diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index a51fac3c8..a700e4825 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -12,6 +12,7 @@ import { useState } from 'react'; import { useQueryClient } from '@tanstack/react-query'; import { ExperimentsTab } from '~/components/ExperimentsTab'; import { ProjectCard } from '~/components/ProjectCard'; +import { RunEvalModal } from '~/components/RunEvalModal'; import { RunList } from '~/components/RunList'; import { TargetsTab } from '~/components/TargetsTab'; import { addProjectApi, discoverProjectsApi, useProjectList, useRunList } from '~/lib/api'; @@ -52,6 +53,7 @@ function ProjectsDashboard() { const [discoverPath, setDiscoverPath] = useState(''); const [error, setError] = useState(null); const [showAddForm, setShowAddForm] = useState(false); + const [showRunEval, setShowRunEval] = useState(false); const projects = data?.projects ?? []; @@ -89,13 +91,22 @@ function ProjectsDashboard() {

Projects

- +
+ + +
{error && ( @@ -144,6 +155,8 @@ function ProjectsDashboard() { ))}
+ + setShowRunEval(false)} />
); } @@ -156,12 +169,22 @@ function SingleProjectHome() { const tab = searchParams.tab as TabId | undefined; const navigate = useNavigate(); const { data, isLoading, error } = useRunList(); + const [showRunEval, setShowRunEval] = useState(false); const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'runs'; return (
-

Evaluation Runs

+
+

Evaluation Runs

+ +
{/* Tab navigation */}
@@ -187,6 +210,8 @@ function SingleProjectHome() { {activeTab === 'runs' && } {activeTab === 'experiments' && } {activeTab === 'targets' && } + + setShowRunEval(false)} />
); } diff --git a/apps/studio/src/routes/projects/$projectId.tsx b/apps/studio/src/routes/projects/$projectId.tsx index bc2848130..493f38064 100644 --- a/apps/studio/src/routes/projects/$projectId.tsx +++ b/apps/studio/src/routes/projects/$projectId.tsx @@ -5,8 +5,10 @@ */ import { createFileRoute, useNavigate, useRouterState } from '@tanstack/react-router'; +import { useState } from 'react'; import { useQuery } from '@tanstack/react-query'; +import { RunEvalModal } from '~/components/RunEvalModal'; import { RunList } from '~/components/RunList'; import { useProjectRunList } from '~/lib/api'; import { projectExperimentsOptions, projectTargetsOptions } from '~/lib/api'; @@ -30,12 +32,22 @@ function ProjectHomePage() { const searchParams = routerState.location.search as Record; const tab = searchParams.tab as TabId | undefined; const navigate = useNavigate(); + const [showRunEval, setShowRunEval] = useState(false); const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'runs'; return (
-

{projectId}

+
+

{projectId}

+ +
{/* Tab navigation */}
@@ -66,6 +78,12 @@ function ProjectHomePage() { {activeTab === 'runs' && } {activeTab === 'experiments' && } {activeTab === 'targets' && } + + setShowRunEval(false)} + projectId={projectId} + />
); } diff --git a/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx index bc19d93d3..94499866c 100644 --- a/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx @@ -3,8 +3,10 @@ */ import { createFileRoute } from '@tanstack/react-router'; +import { useState } from 'react'; import { EvalDetail } from '~/components/EvalDetail'; +import { RunEvalModal } from '~/components/RunEvalModal'; import { useProjectRunDetail } from '~/lib/api'; export const Route = createFileRoute('/projects/$projectId_/evals/$runId/$evalId')({ @@ -14,6 +16,7 @@ export const Route = createFileRoute('/projects/$projectId_/evals/$runId/$evalId function ProjectEvalDetailPage() { const { projectId, runId, evalId } = Route.useParams(); const { data, isLoading, error } = useProjectRunDetail(projectId, runId); + const [showRunEval, setShowRunEval] = useState(false); if (isLoading) { return ( @@ -47,13 +50,31 @@ function ProjectEvalDetailPage() { return (
-
-

- Run: {runId} / Eval: {evalId} -

-

{evalId}

+
+
+

+ Run: {runId} / Eval: {evalId} +

+

{evalId}

+
+
+ setShowRunEval(false)} + projectId={projectId} + prefill={{ + testIds: [evalId], + target: result.target, + }} + />
); } diff --git a/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx b/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx index eb3acbec2..f23ba6095 100644 --- a/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx +++ b/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx @@ -3,8 +3,10 @@ */ import { createFileRoute } from '@tanstack/react-router'; +import { useState } from 'react'; import { RunDetail } from '~/components/RunDetail'; +import { RunEvalModal } from '~/components/RunEvalModal'; import { useProjectRunDetail } from '~/lib/api'; export const Route = createFileRoute('/projects/$projectId_/runs/$runId')({ @@ -14,6 +16,7 @@ export const Route = createFileRoute('/projects/$projectId_/runs/$runId')({ function ProjectRunDetailPage() { const { projectId, runId } = Route.useParams(); const { data, isLoading, error } = useProjectRunDetail(projectId, runId); + const [showRunEval, setShowRunEval] = useState(false); if (isLoading) { return ( @@ -36,13 +39,31 @@ function ProjectRunDetailPage() { ); } + const firstResult = data?.results?.[0]; + const prefill = firstResult?.target ? { target: firstResult.target } : undefined; + return (
-
-

Run: {runId}

-

Source: {data?.source}

+
+
+

Run: {runId}

+

Source: {data?.source}

+
+
+ setShowRunEval(false)} + projectId={projectId} + prefill={prefill} + />
); } diff --git a/apps/studio/src/routes/runs/$runId.tsx b/apps/studio/src/routes/runs/$runId.tsx index 7dd9d7984..1ae307cf8 100644 --- a/apps/studio/src/routes/runs/$runId.tsx +++ b/apps/studio/src/routes/runs/$runId.tsx @@ -3,8 +3,10 @@ */ import { createFileRoute } from '@tanstack/react-router'; +import { useState } from 'react'; import { RunDetail } from '~/components/RunDetail'; +import { RunEvalModal } from '~/components/RunEvalModal'; import { useRunDetail } from '~/lib/api'; export const Route = createFileRoute('/runs/$runId')({ @@ -14,6 +16,7 @@ export const Route = createFileRoute('/runs/$runId')({ function RunDetailPage() { const { runId } = Route.useParams(); const { data, isLoading, error } = useRunDetail(runId); + const [showRunEval, setShowRunEval] = useState(false); if (isLoading) { return ( @@ -36,13 +39,27 @@ function RunDetailPage() { ); } + // Derive prefill from run data + const firstResult = data?.results?.[0]; + const prefill = firstResult?.target ? { target: firstResult.target } : undefined; + return (
-
-

Run: {runId}

-

Source: {data?.source}

+
+
+

Run: {runId}

+

Source: {data?.source}

+
+
+ setShowRunEval(false)} prefill={prefill} />
); }