From 0e35e5f0b10c2c9db10094031a2ac92e59fff9f3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 23 Apr 2026 13:40:27 -0500
Subject: [PATCH 01/55] feat: agentic benchmark ingest + UI with offload-mode
 halo

Adds agentic_traces scenario end-to-end:
- Schema migrations for agentic scenario, availability, and KV offload mode
- DB ingest/ETL + query updates to carry scenario, offload_mode, and
  server/theoretical cache-hit rates through to the API layer
- Frontend types, filters (GlobalFilterContext / InferenceContext /
  ChartControls), a latency-percentile selector for agentic charts, URL
  state, and tooltip rows for agentic-only fields
- ScatterGraph: subtle dashed halo on Pareto-frontier points that used
  KV offload so the tradeoff is visible at a glance
---
 packages/app/cypress/support/mock-data.ts     |   2 +
 .../app/src/app/api/unofficial-run/route.ts   |   2 +
 .../src/components/GlobalFilterContext.tsx    |  12 +-
 .../components/inference/InferenceContext.tsx |  15 ++-
 .../inference/hooks/useChartData.ts           |  34 +++--
 .../app/src/components/inference/types.ts     |  26 ++++
 .../components/inference/ui/ChartControls.tsx |  27 +++-
 .../components/inference/ui/ScatterGraph.tsx  |  21 +++
 .../inference/utils/tooltipUtils.ts           |  54 +++++++-
 .../app/src/components/ui/chart-selectors.tsx | 124 ++++++++++++++++++
 .../unofficial-run-provider.test.ts           |   2 +
 .../components/unofficial-run-provider.tsx    |   4 +-
 packages/app/src/lib/api.ts                   |  14 +-
 .../app/src/lib/benchmark-transform.test.ts   |   2 +
 packages/app/src/lib/benchmark-transform.ts   |  65 ++++++++-
 packages/app/src/lib/data-mappings.ts         |  72 +++++++++-
 packages/app/src/lib/url-state.ts             |   2 +
 packages/constants/src/models.ts              |  17 +++
 .../db/migrations/002_agentic_scenario.sql    |  30 +++++
 .../migrations/003_agentic_availability.sql   |  21 +++
 packages/db/migrations/004_offload_mode.sql   |  42 ++++++
 packages/db/src/etl/benchmark-ingest.ts       |  28 ++--
 packages/db/src/etl/benchmark-mapper.ts       |  45 ++++++-
 packages/db/src/ingest-ci-run.ts              |   6 +-
 packages/db/src/ingest-gcs-backup.ts          |   6 +-
 packages/db/src/ingest-supplemental.ts        |  14 +-
 packages/db/src/json-provider.ts              |   8 +-
 packages/db/src/queries/benchmarks.ts         |  13 +-
 packages/db/src/queries/workflow-info.ts      |  15 ++-
 29 files changed, 645 insertions(+), 78 deletions(-)
 create mode 100644 packages/db/migrations/002_agentic_scenario.sql
 create mode 100644 packages/db/migrations/003_agentic_availability.sql
 create mode 100644 packages/db/migrations/004_offload_mode.sql

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index e6720c0b..7a4f59a9 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,6 +189,8 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
+    selectedPercentile: 'median',
+    setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 79ac0665..dbfb9c33 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -49,6 +49,8 @@ export function normalizeArtifactRows(
       decode_num_workers: config.decodeNumWorkers,
       num_prefill_gpu: config.numPrefillGpu,
       num_decode_gpu: config.numDecodeGpu,
+      benchmark_type: params.benchmarkType,
+      offload_mode: params.offloadMode,
       isl: params.isl,
       osl: params.osl,
       conc: params.conc,
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 65f510cd..f603081a 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -11,7 +11,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import { useAvailability } from '@/hooks/api/use-availability';
 import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
@@ -172,11 +172,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   const availableSequences = useMemo(() => {
     if (!availabilityRows) return SEQUENCE_OPTIONS;
     const seqs = [
-      ...new Set(
-        modelRows
-          .map((r) => islOslToSequence(r.isl, r.osl))
-          .filter((s): s is Sequence => s !== null),
-      ),
+      ...new Set(modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null)),
     ];
     return seqs.length > 0 ? seqs : SEQUENCE_OPTIONS;
   }, [availabilityRows, modelRows]);
@@ -190,7 +186,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   // Precisions available for the selected model + sequence
   const availablePrecisions = useMemo(() => {
     if (!availabilityRows) return ['fp4'];
-    const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const precs = [...new Set(rows.map((r) => r.precision))].toSorted();
     return precs.length > 0 ? precs : ['fp4'];
   }, [availabilityRows, modelRows, effectiveSequence]);
@@ -205,7 +201,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   // Dates available for selected model + sequence + precisions
   const availableDates = useMemo(() => {
     if (!availabilityRows) return [];
-    const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision));
     if (rows.length === 0) {
       return [...new Set(seqRows.map((r) => r.date))].toSorted();
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 7fa416fd..6f45d8d7 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -11,7 +11,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import { FAVORITE_PRESETS, type FavoritePreset } from '@/components/favorites/favorite-presets';
 
@@ -110,6 +110,11 @@ export function InferenceProvider({
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || null,
   );
+  // Latency percentile applied to the chart x-axis for agentic scenarios.
+  // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
+  const [selectedPercentile, setSelectedPercentile] = useState<string>(
+    () => getUrlParam('i_pctl') || 'median',
+  );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
   );
@@ -163,6 +168,7 @@ export function InferenceProvider({
     effectiveRunDate,
     isActive,
     latestDate,
+    selectedPercentile,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
@@ -176,7 +182,7 @@ export function InferenceProvider({
     if (!availabilityRows) return availableDates;
     const rows = availabilityRows.filter((r) => {
       if (!dbModelKeys.includes(r.model)) return false;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false;
+      if (rowToSequence(r) !== effectiveSequence) return false;
       if (!effectivePrecisions.includes(r.precision)) return false;
       if (!r.hardware) return false;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -201,7 +207,7 @@ export function InferenceProvider({
     const hwKeys = new Set<string>();
     for (const r of availabilityRows) {
       if (!dbModelKeys.includes(r.model)) continue;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue;
+      if (rowToSequence(r) !== effectiveSequence) continue;
       if (!effectivePrecisions.includes(r.precision)) continue;
       if (!r.hardware) continue;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -589,6 +595,7 @@ export function InferenceProvider({
   useUrlStateSync(
     {
       i_metric: selectedYAxisMetric,
+      i_pctl: selectedPercentile,
       i_gpus: selectedGPUs.join(','),
       i_dates: selectedDates.join(','),
       i_dstart: selectedDateRange.startDate,
@@ -783,6 +790,8 @@ export function InferenceProvider({
       workflowInfo,
       selectedYAxisMetric,
       setSelectedYAxisMetric: setSelectedYAxisMetricAndClear,
+      selectedPercentile,
+      setSelectedPercentile,
       selectedGPUs,
       setSelectedGPUs: setSelectedGPUsAndClear,
       availableGPUs,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 625e63ab..81ab0780 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -1,7 +1,7 @@
 import { useMemo, useRef } from 'react';
 
 import { useQueries } from '@tanstack/react-query';
-import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants';
+import { rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type {
@@ -15,7 +15,7 @@ import type {
 import { filterDataByCostLimit } from '@/components/inference/utils';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants';
-import { transformBenchmarkRows } from '@/lib/benchmark-transform';
+import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
 import type { Model, Sequence } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 
@@ -79,6 +79,7 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
+  selectedPercentile = 'median',
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
@@ -119,11 +120,13 @@ export function useChartData(
   // Merge main rows with comparison date rows.
   // Stamp each row with the *requested* date (not the actual DB date) so that
   // GPUGraph's activeDates filter (keyed by user-selected date) matches the points.
-  const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]);
+  //
+  // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via
+  // benchmark_type), so one filter covers every scenario.
   const rows = useMemo(() => {
-    if (!allRows || !sequenceIslOsl) return [];
-    const seqFilter = (r: { isl: number; osl: number }) =>
-      r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl;
+    if (!allRows) return [];
+    const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) =>
+      rowToSequence(r) === selectedSequence;
     const seqFiltered = allRows.filter(seqFilter);
 
     // For each (hw, framework, spec_method, disagg, precision) group, keep only
@@ -150,14 +153,14 @@ export function useChartData(
         .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })),
     );
     return [...mainRows, ...extraRows];
-  }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]);
+  }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]);
 
   // Transform filtered rows into chart data
   const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => {
     if (rows.length === 0)
       return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig };
-    return transformBenchmarkRows(rows);
-  }, [rows]);
+    return transformBenchmarkRows(rows, selectedPercentile);
+  }, [rows, selectedPercentile]);
 
   // Sort hardware config — stabilize reference when keys haven't changed.
   // Different sequences for the same model often have the same GPU configs,
@@ -192,8 +195,11 @@ export function useChartData(
       (chartDefinitions as ChartDefinition[]).map((chartDef) => {
         const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
 
-        // Determine dynamic x-axis
-        let xAxisField: keyof AggDataEntry = chartDef.x;
+        // Default x-axis = chart's natural latency metric, percentile-adjusted
+        // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic
+        // scenarios `withPercentile` is a no-op when percentile === 'median'.
+        const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry;
+        let xAxisField: keyof AggDataEntry = naturalX;
         let xAxisLabel = chartDef.x_label;
 
         const metricTitle =
@@ -232,8 +238,10 @@ export function useChartData(
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
         // so no roofline flip is needed for the e2e chart.
+        // Compare against `naturalX` (percentile-adjusted) — switching the
+        // percentile of the same logical metric is NOT a flip.
         const xAxisFlipped =
-          xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride);
+          xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride);
 
         const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition;
         const dynamicYLabel = chartDef[yLabelKey];
@@ -261,7 +269,7 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric],
+    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index a23707ba..53c8d84c 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -88,6 +88,29 @@ export interface AggDataEntry {
   actualDate?: string;
   /** URL to the GitHub Actions workflow run that produced this data point. */
   run_url?: string;
+  /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */
+  benchmark_type?: string;
+  /** ISL in tokens — null for agentic_traces. */
+  isl?: number | null;
+  /** OSL in tokens — null for agentic_traces. */
+  osl?: number | null;
+  // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ──
+  /** "on" | "off" — whether KV cache offload to CPU was enabled. */
+  offload_mode?: string;
+  /** Actual server-observed GPU prefix-cache hit rate (0..1). */
+  server_gpu_cache_hit_rate?: number;
+  /** Actual server-observed CPU prefix-cache hit rate (0..1). */
+  server_cpu_cache_hit_rate?: number;
+  /** Infinite-cache theoretical hit rate (0..1) computed from trace. */
+  theoretical_cache_hit_rate?: number;
+  /** Total requests attempted during the window. */
+  num_requests_total?: number;
+  /** Requests that completed successfully. */
+  num_requests_successful?: number;
+  /** Total prompt tokens served. */
+  total_prompt_tokens?: number;
+  /** Total generated (output) tokens. */
+  total_generation_tokens?: number;
 }
 
 /**
@@ -468,6 +491,9 @@ export interface InferenceChartContextType {
   workflowInfo: any;
   selectedYAxisMetric: string;
   setSelectedYAxisMetric: (metric: string) => void;
+  /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */
+  selectedPercentile: string;
+  setSelectedPercentile: (p: string) => void;
   selectedXAxisMetric: string | null;
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 5f8e7787..e4f55ad7 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -1,11 +1,14 @@
 'use client';
 
+import { useEffect, useState } from 'react';
+
 import { track } from '@/lib/analytics';
 
 import { useInference } from '@/components/inference/InferenceContext';
 import {
   ModelSelector,
-  SequenceSelector,
+  ScenarioSelector,
+  PercentileSelector,
   PrecisionSelector,
 } from '@/components/ui/chart-selectors';
 import { DateRangePicker } from '@/components/ui/date-range-picker';
@@ -23,7 +26,7 @@ import {
 import { TooltipProvider } from '@/components/ui/tooltip';
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type { ChartDefinition } from '@/components/inference/types';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model, type Percentile } from '@/lib/data-mappings';
 
 // Build Y-axis metric options from static chart config JSON — available immediately, no API wait
 const METRIC_GROUPS = [
@@ -78,6 +81,13 @@ interface ChartControlsProps {
 }
 
 export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) {
+  // The percentile selector is rendered conditionally on `selectedSequence`,
+  // which on the client is hydrated from URL params. SSR doesn't see the URL,
+  // so deferring the conditional until after mount keeps the initial DOM
+  // identical between server and client (avoids hydration warnings).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const {
     selectedModel,
     setSelectedModel,
@@ -87,6 +97,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
     setSelectedPrecisions,
     selectedYAxisMetric,
     setSelectedYAxisMetric,
+    selectedPercentile,
+    setSelectedPercentile,
     graphs,
     selectedGPUs,
     setSelectedGPUs,
@@ -203,12 +215,19 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
             availableModels={availableModels}
             data-testid="model-selector"
           />
-          <SequenceSelector
+          <ScenarioSelector
             value={selectedSequence}
             onChange={handleSequenceChange}
             availableSequences={availableSequences}
-            data-testid="sequence-selector"
+            data-testid="scenario-selector"
           />
+          {mounted && selectedSequence === Sequence.AgenticTraces && (
+            <PercentileSelector
+              value={selectedPercentile}
+              onChange={(p: Percentile) => setSelectedPercentile(p)}
+              data-testid="percentile-selector"
+            />
+          )}
           <PrecisionSelector
             value={selectedPrecisions}
             onChange={handlePrecisionChange}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 2e078f89..15bb60f0 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -1512,6 +1512,24 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
+        // Offload halo: dashed ring on frontier points that used KV offload
+        zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
+          const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`);
+          const showHalo = onFrontier && d.offload_mode === 'on';
+          d3.select(this)
+            .selectAll<SVGCircleElement, boolean>('.offload-halo')
+            .data(showHalo ? [true] : [])
+            .join('circle')
+            .attr('class', 'offload-halo')
+            .attr('r', POINT_SIZE + 4)
+            .attr('fill', 'none')
+            .attr('stroke', 'var(--foreground)')
+            .attr('stroke-width', 1.5)
+            .attr('stroke-dasharray', '3 2')
+            .attr('opacity', 0.9)
+            .attr('pointer-events', 'none');
+        });
+
         // Double-click to track/untrack
         zoomGroup
           .selectAll<SVGGElement, InferenceData>('.dot-group')
@@ -1567,6 +1585,9 @@ const ScatterGraph = React.memo(
         chartDefinition.chartType,
         xScaleConfig._isLog,
         yScaleConfig.type,
+        optimalPointKeys,
+        getCssColor,
+        resolveColor,
       ],
     );
 
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index e88e9930..7391225e 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -88,6 +88,51 @@ const runLinkHTML = (runUrl?: string) =>
 const tooltipLine = (label: string, value: string | number) =>
   `<div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;"><strong>${label}:</strong> ${value}</div>`;
 
+/** Formats a 0..1 fraction as a one-decimal percentage; null when absent or NaN. */
+const formatPct = (v: number | null | undefined): string | null =>
+  v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
+
+/**
+ * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
+ * success, token totals. Returns an empty string for non-agentic rows.
+ *
+ * A row is emitted only when its value is present. `null` is treated the same
+ * as `undefined`, so a null count or token total is skipped rather than
+ * rendered as the literal text "null".
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+  if (d.benchmark_type !== 'agentic_traces') return '';
+
+  const parts: string[] = [];
+  if (d.offload_mode) {
+    parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+  }
+
+  // formatPct yields null for an absent rate, which drops that row.
+  const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+  const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+  const theoHit = formatPct(d.theoretical_cache_hit_rate);
+  if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+  if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+  if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+
+  const { num_requests_total: total, num_requests_successful: ok } = d;
+  if (total !== undefined && total !== null && ok !== undefined && ok !== null) {
+    // The success percentage is left out when nothing was attempted (no 0/0).
+    const successPct = total > 0 ? ` (${((ok / total) * 100).toFixed(0)}%)` : '';
+    parts.push(tooltipLine('Requests', `${ok} / ${total}${successPct}`));
+  }
+
+  if (d.total_prompt_tokens !== undefined && d.total_prompt_tokens !== null) {
+    parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+  }
+  if (d.total_generation_tokens !== undefined && d.total_generation_tokens !== null) {
+    parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+  }
+
+  return parts.join('');
+};
+
 /**
  * Generates HTML for the parallelism configuration section of a tooltip.
  * Falls back to GPU count for old data without parallelism fields.
@@ -177,9 +222,10 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
       ${
         isPinned
@@ -231,9 +277,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
     </div>
   `;
 };
@@ -292,9 +339,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
     </div>
   `;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 75e2f257..1c843e12 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -19,12 +19,16 @@ import {
   type Model,
   type Precision,
   type Sequence,
+  type Percentile,
+  PERCENTILE_OPTIONS,
   getModelCategory,
   getModelLabel,
+  getPercentileLabel,
   getPrecisionLabel,
   getSequenceCategory,
   getSequenceLabel,
   groupByCategory,
+  sequenceKind,
 } from '@/lib/data-mappings';
 
 function DeprecatedLabel({ reason }: { reason: string }) {
@@ -167,6 +171,126 @@ export function SequenceSelector({
   );
 }
 
+interface ScenarioSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Sequence) => void;
+  availableSequences: string[];
+  'data-testid'?: string;
+}
+
+/**
+ * Dropdown for picking the benchmark scenario, labelled "Scenario" because
+ * ISL/OSL only describes the fixed-seq subset. Fixed-seq entries sit under a
+ * "Fixed Sequence Length" group (default-category entries first, deprecated
+ * ones after a DeprecatedLabel); agentic entries follow as ungrouped items.
+ */
+export function ScenarioSelector({
+  id = 'scenario-select',
+  value,
+  onChange,
+  availableSequences,
+  'data-testid': testId,
+}: ScenarioSelectorProps) {
+  // Partition the available entries by kind; any other kind is not rendered.
+  const kindOf = (seq: string) => sequenceKind(seq as Sequence);
+  const fixedSeqOptions = availableSequences.filter((seq) => kindOf(seq) === 'fixed-seq');
+  const agenticOptions = availableSequences.filter((seq) => kindOf(seq) === 'agentic');
+  const fixedByCategory = groupByCategory(fixedSeqOptions, (seq) =>
+    getSequenceCategory(seq as Sequence),
+  );
+
+  // Record a `selector_scenario_changed` track event, then notify the parent.
+  const handleValueChange = (next: string) => {
+    track('selector_scenario_changed', { scenario: next });
+    onChange(next as Sequence);
+  };
+
+  // Every entry renders the same way regardless of which section it is in.
+  const renderOption = (seq: string) => (
+    <SelectItem key={seq} value={seq}>
+      {getSequenceLabel(seq as Sequence)}
+    </SelectItem>
+  );
+
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Scenario"
+        tooltip="Benchmark scenario. Fixed Sequence Length runs use a defined input/output token count (ISL/OSL). Agentic Traces replay real agentic workloads with variable inputs/outputs."
+      />
+      <Select value={value} onValueChange={handleValueChange}>
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {fixedSeqOptions.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Fixed Sequence Length</SelectLabel>
+              {fixedByCategory.default.map((seq) => renderOption(seq))}
+              {fixedByCategory.deprecated.length > 0 && (
+                <>
+                  <DeprecatedLabel reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  {fixedByCategory.deprecated.map((seq) => renderOption(seq))}
+                </>
+              )}
+            </SelectGroup>
+          )}
+          {agenticOptions.map((seq) => renderOption(seq))}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
+interface PercentileSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Percentile) => void;
+  'data-testid'?: string;
+}
+
+/**
+ * Latency percentile dropdown for agentic-trace charts, offering every entry
+ * of `PERCENTILE_OPTIONS`. The component only reports the choice; the chart
+ * code uses it to rewrite the x-axis metric from `median_*` to
+ * `{percentile}_*`, so picking p99 plots p99 e2e latency / interactivity
+ * instead of the median.
+ */
+export function PercentileSelector(props: PercentileSelectorProps) {
+  const { id = 'percentile-select', value, onChange, 'data-testid': testId } = props;
+
+  // Record a `selector_percentile_changed` track event, then notify the parent.
+  const handleValueChange = (next: string) => {
+    track('selector_percentile_changed', { percentile: next });
+    onChange(next as Percentile);
+  };
+
+  // One item per supported percentile, labelled via getPercentileLabel.
+  const options = PERCENTILE_OPTIONS.map((pct) => (
+    <SelectItem key={pct} value={pct}>
+      {getPercentileLabel(pct)}
+    </SelectItem>
+  ));
+
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Latency Percentile"
+        tooltip="Percentile of the latency distribution used for the chart x-axis. Agentic runs carry median/p90/p99/p99.9 variants; switch percentiles to see tail-latency behavior."
+      />
+      <Select value={value} onValueChange={handleValueChange}>
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>{options}</SelectContent>
+      </Select>
+    </div>
+  );
+}
+
 interface PrecisionSelectorProps {
   id?: string;
   value: string[];
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index f4263d2c..05b522c5 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -29,6 +29,8 @@ function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 128,
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index 2dccdf7f..42530a51 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -12,7 +12,7 @@ import {
 
 import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types';
 import { UnofficialBanner } from '@/components/ui/unofficial-banner';
-import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import type { BenchmarkRow, EvalRow } from '@/lib/api';
 import { normalizeEvalHardwareKey } from '@/lib/chart-utils';
@@ -93,7 +93,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData
   const groups = new Map<string, BenchmarkRow[]>();
   for (const row of benchmarks) {
     const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? row.model;
-    const sequence = islOslToSequence(row.isl, row.osl);
+    const sequence = rowToSequence(row);
     if (!sequence) continue;
     const key = `${displayModel}_${sequence}`;
     if (!groups.has(key)) groups.set(key, []);
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 11ba4521..240251c3 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -23,9 +23,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   date: string;
@@ -140,13 +144,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index be76438e..6a6c97c8 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -23,6 +23,8 @@ function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 64,
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 107f0b12..69745da2 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -15,9 +15,39 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
 import { getHardwareConfig } from '@/lib/constants';
 import type { BenchmarkRow } from '@/lib/api';
 
+/**
+ * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl
+ * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here:
+ *   e2el   ≡ ttlt   (time-to-last-token == end-to-end latency)
+ *   tpot   ≡ itl    (time-per-output-token == inter-token-latency for single-output)
+ *   intvty ≡ 1/itl  (tok/s from the user's perspective; skipped unless itl > 0)
+ * Covers mean/median/p90/p99/p99.9. Existing fields win; only gaps are returned.
+ */
+function agenticAliases(m: Record<string, number>): Record<string, number> {
+  const out: Record<string, number> = {};
+  for (const prefix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
+    const itl = m[`${prefix}_itl`];
+    const ttlt = m[`${prefix}_ttlt`];
+    if (m[`${prefix}_e2el`] === undefined && ttlt !== undefined) out[`${prefix}_e2el`] = ttlt;
+    if (m[`${prefix}_tpot`] === undefined && itl !== undefined) out[`${prefix}_tpot`] = itl;
+    if (m[`${prefix}_intvty`] === undefined && itl !== undefined && itl > 0) {
+      out[`${prefix}_intvty`] = 1 / itl;
+    }
+  }
+  return out;
+}
+
 /** Convert a DB benchmark row to an AggDataEntry. */
 export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
-  const m = row.metrics;
+  const isAgentic = row.benchmark_type === 'agentic_traces';
+  const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics;
+  // Prefer the dedicated column (added in migration 004); fall back to the
+  // legacy stash inside `metrics` for any rows ingested before that column
+  // existed.
+  const rawMetrics = row.metrics as Record<string, unknown>;
+  const offloadMode =
+    row.offload_mode ??
+    (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
     hw: row.hardware,
     framework: row.framework,
@@ -68,6 +98,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     date: row.date,
     actualDate: (row as any).actualDate ?? row.date,
     run_url: row.run_url ?? undefined,
+    benchmark_type: row.benchmark_type,
+    isl: row.isl,
+    osl: row.osl,
+    offload_mode: offloadMode,
+    server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate,
+    server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate,
+    theoretical_cache_hit_rate: m.theoretical_cache_hit_rate,
+    num_requests_total: m.num_requests_total,
+    num_requests_successful: m.num_requests_successful,
+    total_prompt_tokens: m.total_prompt_tokens,
+    total_generation_tokens: m.total_generation_tokens,
   };
 }
 
@@ -77,13 +118,30 @@ interface PreparedEntry {
   date: string;
 }
 
+/**
+ * Swap the leading latency-percentile prefix of a chart x-axis key (`median_` →
+ * `p99_` etc). Keys without a known percentile prefix are returned unchanged.
+ */
+export function withPercentile(key: string, percentile: string): string {
+  const leadingPercentile = /^(mean|median|p90|p99|p99\.9)_/;
+  return key.replace(leadingPercentile, `${percentile}_`);
+}
+
 /**
  * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
  * Returns one InferenceData[] per chart definition (e2e, interactivity).
  *
  * Converts rows to AggDataEntry once, then reuses for each chart definition.
+ *
+ * @param percentile Optional latency percentile for the chart x-axis
+ *   (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart
+ *   definition for the chosen percentile — only agentic rows carry the
+ *   full set (median/p90/p99/p99.9) so this mainly affects that scenario.
  */
-export function transformBenchmarkRows(rows: BenchmarkRow[]): {
+export function transformBenchmarkRows(
+  rows: BenchmarkRow[],
+  percentile = 'median',
+): {
   chartData: InferenceData[][];
   hardwareConfig: HardwareConfig;
 } {
@@ -109,13 +167,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): {
 
   // Phase 2: Build chart data per chart definition (reusing prepared entries)
   const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => {
+    const xKey = withPercentile(chartDef.x, percentile);
     const groupedByHw: Record<string, InferenceData[]> = {};
 
     for (const { entry, hwKey, date } of prepared) {
       const dataPoint = createChartDataPoint(
         date,
         entry,
-        chartDef.x as keyof AggDataEntry,
+        xKey as keyof AggDataEntry,
         chartDef.y as keyof AggDataEntry,
         hwKey,
       );
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 823b6823..8900f50e 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -102,17 +102,77 @@ export enum Sequence {
   OneK_OneK = '1k/1k',
   OneK_EightK = '1k/8k',
   EightK_OneK = '8k/1k',
+  AgenticTraces = 'agentic-traces',
 }
 
-const SEQUENCE_CONFIG: Record<Sequence, { label: string; compact: string; category: CategoryTag }> =
-  {
-    [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' },
-    [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' },
-    [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' },
-  };
+/** Scenario kind: fixed-seq sequences share one selector group; agentic is its own. */
+export type ScenarioKind = 'fixed-seq' | 'agentic';
+
+/** Classify a sequence: `Sequence.AgenticTraces` is 'agentic', anything else 'fixed-seq'. */
+export function sequenceKind(seq: Sequence): ScenarioKind {
+  if (seq === Sequence.AgenticTraces) return 'agentic';
+  // Every remaining Sequence member is a fixed ISL/OSL pair.
+  return 'fixed-seq';
+}
+
+const SEQUENCE_CONFIG: Record<
+  Sequence,
+  { label: string; compact: string; category: CategoryTag; kind: ScenarioKind }
+> = {
+  [Sequence.OneK_OneK]: {
+    label: '1K / 1K',
+    compact: '1k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.OneK_EightK]: {
+    label: '1K / 8K',
+    compact: '1k8k',
+    category: 'deprecated',
+    kind: 'fixed-seq',
+  },
+  [Sequence.EightK_OneK]: {
+    label: '8K / 1K',
+    compact: '8k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.AgenticTraces]: {
+    label: 'Agentic Traces',
+    compact: 'agentic',
+    category: 'default',
+    kind: 'agentic',
+  },
+};
 
 export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
+/**
+ * Latency percentile plotted on the chart x-axis for agentic traces. Agentic
+ * rows carry median/p90/p99/p99.9 variants of ttft, ttlt (=e2el), and itl
+ * (intvty is derived from itl); this picks which slice to plot.
+ */
+export enum Percentile {
+  Median = 'median',
+  P90 = 'p90',
+  P99 = 'p99',
+  P99_9 = 'p99.9',
+}
+
+const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.Median]: { label: 'p50 (median)' },
+  [Percentile.P90]: { label: 'p90' },
+  [Percentile.P99]: { label: 'p99' },
+  [Percentile.P99_9]: { label: 'p99.9' },
+};
+
+export const PERCENTILE_OPTIONS: Percentile[] = Object.values(Percentile);
+
+export function getPercentileLabel(p: Percentile): string {
+  const entry = PERCENTILE_CONFIG[p];
+  return entry?.label ?? p;
+}
+
 export const DEPRECATED_SEQUENCES: ReadonlySet<Sequence> = new Set(
   (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][])
     .filter(([, c]) => c.category === 'deprecated')
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 3947488f..fb2e9d70 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -22,6 +22,7 @@ const URL_STATE_KEYS = [
   'i_seq',
   'i_prec',
   'i_metric',
+  'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
   'i_scale',
@@ -61,6 +62,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_seq: '8k/1k',
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
+  i_pctl: 'median',
   i_xmetric: 'p99_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',
diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts
index 6d646f08..d9a3d2d1 100644
--- a/packages/constants/src/models.ts
+++ b/packages/constants/src/models.ts
@@ -53,3 +53,20 @@ export function islOslToSequence(isl: number, osl: number): string | null {
   };
   return map[`${isl}_${osl}`] ?? null;
 }
+
+/**
+ * Resolve the sequence (scenario) string for a benchmark/availability row.
+ * - `agentic_traces` rows always yield `'agentic-traces'`; isl/osl are ignored.
+ * - Every other `benchmark_type` is delegated to `islOslToSequence`.
+ * Yields `null` when the row can't be classified: isl or osl is null, or
+ * `islOslToSequence` has no entry for the pair.
+ */
+export function rowToSequence(row: {
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+}): string | null {
+  const { isl, osl, benchmark_type: benchmarkType } = row;
+  if (benchmarkType === 'agentic_traces') return 'agentic-traces';
+  return isl === null || osl === null ? null : islOslToSequence(isl, osl);
+}
diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql
new file mode 100644
index 00000000..c143914e
--- /dev/null
+++ b/packages/db/migrations/002_agentic_scenario.sql
@@ -0,0 +1,30 @@
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+--   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+--   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+  alter column isl drop not null,
+  alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+  drop constraint benchmark_results_isl_positive,
+  drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat NULL isl/osl as equal (NULLS NOT DISTINCT, PostgreSQL 15+)
+--    so agentic rows can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql
new file mode 100644
index 00000000..e96cbd50
--- /dev/null
+++ b/packages/db/migrations/003_agentic_availability.sql
@@ -0,0 +1,21 @@
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint (PostgreSQL 15+):
+-- same uniqueness, but isl/osl may be NULL for agentic rows. No PK remains.
+
+alter table availability
+  drop constraint availability_pkey;
+
+alter table availability
+  alter column isl drop not null,
+  alter column osl drop not null,
+  add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+  add constraint availability_natural_key unique nulls not distinct
+    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql
new file mode 100644
index 00000000..24b617f1
--- /dev/null
+++ b/packages/db/migrations/004_offload_mode.sql
@@ -0,0 +1,42 @@
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+  add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value the earlier agentic ingest
+-- stored in metrics JSONB. NOTE(review): a JSON null there would violate NOT NULL.
+update benchmark_results
+   set offload_mode = metrics->>'offload_mode'
+ where benchmark_type = 'agentic_traces'
+   and metrics ? 'offload_mode';
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+  br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index 67173c64..ea802d3f 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows(
 
   // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears
   // more than once in a single batch. Deduplicate within the batch, keeping
-  // the last occurrence (last metrics for each unique config/isl/osl/conc).
+  // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode).
   const seen = new Map<string, BenchmarkParams & { configId: number }>();
-  for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r);
+  for (const r of rows) {
+    seen.set(
+      `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? ''}-${r.conc}-${r.offloadMode}`,
+      r,
+    );
+  }
   const deduped = [...seen.values()];
 
   const configIds = deduped.map((r) => r.configId);
+  const benchmarkTypes = deduped.map((r) => r.benchmarkType);
+  const offloadModes = deduped.map((r) => r.offloadMode);
   const isls = deduped.map((r) => r.isl);
   const osls = deduped.map((r) => r.osl);
   const concs = deduped.map((r) => r.conc);
@@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows(
 
   const result = await sql<{ inserted: boolean; id: number }[]>`
     insert into benchmark_results (
-      workflow_run_id, config_id, benchmark_type, date,
+      workflow_run_id, config_id, benchmark_type, offload_mode, date,
       isl, osl, conc, image, metrics
     )
     select
       ${workflowRunId},
       unnest(${sql.array(configIds)}::int[]),
-      'single_turn',
+      unnest(${sql.array(benchmarkTypes)}::text[]),
+      unnest(${sql.array(offloadModes)}::text[]),
       ${date}::date,
       unnest(${sql.array(isls)}::int[]),
       unnest(${sql.array(osls)}::int[]),
       unnest(${sql.array(concs)}::int[]),
       unnest(${sql.array(images)}),
       unnest(${sql.array(metricsJsons)}::jsonb[])
-    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc)
+    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
     do update set
       metrics = excluded.metrics,
       image = excluded.image
@@ -147,13 +155,14 @@ export async function bulkUpsertAvailability(
   sql: Sql,
   rows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[],
   date: string,
 ): Promise<void> {
@@ -162,7 +171,7 @@ export async function bulkUpsertAvailability(
   const seen = new Set<string>();
   const unique: typeof rows = [];
   for (const r of rows) {
-    const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`;
+    const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? ''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`;
     if (!seen.has(key)) {
       seen.add(key);
       unique.push(r);
@@ -170,7 +179,7 @@ export async function bulkUpsertAvailability(
   }
 
   await sql`
-    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date)
+    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date)
     select
       unnest(${sql.array(unique.map((r) => r.model))}::text[]),
       unnest(${sql.array(unique.map((r) => r.isl))}::int[]),
@@ -180,6 +189,7 @@ export async function bulkUpsertAvailability(
       unnest(${sql.array(unique.map((r) => r.framework))}::text[]),
       unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]),
       unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]),
+      unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]),
       ${date}::date
     on conflict do nothing
   `;
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 7d78e175..5b120843 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([
   'decode_num_workers',
   'num_prefill_gpu',
   'num_decode_gpu',
+  // agentic scenario
+  'scenario_type',
+  'users',
+  'offload_mode',
+  'num_requests_total',
+  'num_requests_successful',
 ]);
 
+/**
+ * `benchmark_type` values understood by the ingest.
+ * - `single_turn`    — fixed sequence-length runs (isl/osl set).
+ * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc).
+ */
+export type BenchmarkType = 'single_turn' | 'agentic_traces';
+
 /**
  * METRIC_KEYS from constants is the canonical set of known metric keys.
  * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set<string>();
 
 export interface BenchmarkParams {
   config: ConfigParams;
-  isl: number;
-  osl: number;
+  benchmarkType: BenchmarkType;
+  // Null for agentic_traces; present for single_turn.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */
+  offloadMode: string;
   image: string | null;
   metrics: Record<string, number>;
 }
@@ -114,10 +131,15 @@ export function mapBenchmarkRow(
     return null;
   }
 
-  const isl = parseInt2(row.isl) ?? islOslFallback?.isl;
-  const osl = parseInt2(row.osl) ?? islOslFallback?.osl;
-  const conc = parseInt2(row.conc);
-  if (!isl || !osl || !conc) {
+  // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants),
+  // no isl/osl, and `users` instead of `conc`. Everything else stays as-is.
+  const isAgentic = String(row.scenario_type ?? '').startsWith('agentic');
+  const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn';
+
+  const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
+  const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
+  const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc);
+  if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
@@ -182,6 +204,12 @@ export function mapBenchmarkRow(
     }
   }
 
+  // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it
+  // as a stringified metric so the frontend can expose it in tooltips.
+  if (isAgentic && typeof row.offload_mode === 'string') {
+    (metrics as Record<string, unknown>).offload_mode = row.offload_mode;
+  }
+
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
   const image = row.image ? String(row.image).replaceAll('#', '/') : null;
 
@@ -205,9 +233,14 @@ export function mapBenchmarkRow(
       numPrefillGpu,
       numDecodeGpu,
     },
+    benchmarkType,
     isl,
     osl,
     conc,
+    offloadMode:
+      typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+        ? row.offload_mode
+        : 'off',
     image,
     metrics,
   };
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 14c7b4d0..8cce43ca 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -248,13 +248,14 @@ async function main(): Promise<void> {
 
   const availRows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[] = [];
 
   let totalNewBmk = 0,
@@ -367,6 +368,7 @@ async function main(): Promise<void> {
               framework: r.config.framework,
               specMethod: r.config.specMethod,
               disagg: r.config.disagg,
+              benchmarkType: r.benchmarkType,
             });
           }
 
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index e20278d6..6dc604e9 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -596,13 +596,14 @@ async function main(): Promise<void> {
     // Upsert availability rows only for successfully resolved configs
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const r of allInserted) {
       availRows.push({
@@ -614,6 +615,7 @@ async function main(): Promise<void> {
         framework: r.config.framework,
         specMethod: r.config.specMethod,
         disagg: r.config.disagg,
+        benchmarkType: r.benchmarkType,
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts
index 1e494e9f..43aae047 100644
--- a/packages/db/src/ingest-supplemental.ts
+++ b/packages/db/src/ingest-supplemental.ts
@@ -219,8 +219,10 @@ async function ingestSupplementalBmk(
 
     const rows: {
       configId: number;
-      isl: number;
-      osl: number;
+      benchmarkType: 'single_turn' | 'agentic_traces';
+      offloadMode: string;
+      isl: number | null;
+      osl: number | null;
       conc: number;
       image: string | null;
       metrics: Record<string, number>;
@@ -271,6 +273,8 @@ async function ingestSupplementalBmk(
 
       rows.push({
         configId,
+        benchmarkType: 'single_turn',
+        offloadMode: 'off',
         isl: entry.isl,
         osl: entry.osl,
         conc: entry.conc,
@@ -294,13 +298,14 @@ async function ingestSupplementalBmk(
     // to `rows` are exactly the valid ones.
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const entry of entries) {
       const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined });
@@ -317,6 +322,7 @@ async function ingestSupplementalBmk(
         framework,
         specMethod,
         disagg,
+        benchmarkType: 'single_turn',
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 0d9373d3..f09a2686 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -290,6 +290,8 @@ function toBenchmarkRow(
     decode_num_workers: c.decode_num_workers,
     num_prefill_gpu: c.num_prefill_gpu,
     num_decode_gpu: c.num_decode_gpu,
+    benchmark_type: br.benchmark_type ?? 'single_turn',
+    offload_mode: (br as { offload_mode?: string }).offload_mode ?? 'off',
     isl: br.isl,
     osl: br.osl,
     conc: br.conc,
@@ -410,7 +412,11 @@ export function getAvailabilityData(): AvailabilityRow[] {
   for (const a of s.availability) {
     const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`;
     if (validKeys.has(key)) {
-      rows.push({ ...a, date: toDateString(a.date) });
+      rows.push({
+        ...a,
+        benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn',
+        date: toDateString(a.date),
+      });
     }
   }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 1c30b1fd..74e20380 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -18,9 +18,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces; numeric for single_turn fixed-seq runs.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   date: string;
@@ -68,6 +72,8 @@ export async function getLatestBenchmarks(
         c.decode_num_workers,
         c.num_prefill_gpu,
         c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
         br.isl,
         br.osl,
         br.conc,
@@ -106,6 +112,8 @@ export async function getLatestBenchmarks(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      lb.benchmark_type,
+      lb.offload_mode,
       lb.isl,
       lb.osl,
       lb.conc,
@@ -153,6 +161,7 @@ export async function getAllBenchmarksForHistory(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
       br.isl,
       br.osl,
       br.conc,
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts
index b4e4f255..d5e2d933 100644
--- a/packages/db/src/queries/workflow-info.ts
+++ b/packages/db/src/queries/workflow-info.ts
@@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise<DateC
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
-/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, date) combos for the availability API. */
+/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, benchmark_type, date) combos for the availability API. */
 export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRow[]> {
   const rows = await sql`
-    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text
+    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text
     FROM availability a
     WHERE EXISTS (
       SELECT 1
@@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRo
         AND c.hardware = a.hardware
         AND c.framework = a.framework
         AND c.precision = a.precision
-        AND br.isl = a.isl
-        AND br.osl = a.osl
+        AND br.isl IS NOT DISTINCT FROM a.isl
+        AND br.osl IS NOT DISTINCT FROM a.osl
+        AND br.benchmark_type = a.benchmark_type
         AND br.date = a.date
         AND br.error IS NULL
         AND wr.conclusion IS NOT NULL

From 9c43a762cdaf9edd0091ef9d3034d4a974071c6d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 30 Apr 2026 19:01:56 -0500
Subject: [PATCH 02/55] =?UTF-8?q?fix:=20agentic=20offload=20variants=20?=
 =?UTF-8?q?=E2=80=94=20render=20both=20halos=20+=20map=20renamed=20fields?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ScatterGraph: include `offload_mode` in `buildPointConfigId` so d3's data
  join keeps both `on` and `off` variants for the same (config, conc).
  Without it, the second variant collapsed onto the first key, so FP8
  offload-on points (and their halos) silently disappeared.
- benchmark-mapper: handle older artifacts that emit `users`/`offload_mode`
  AND newer ones that emit `conc`/`offloading` (with 'none' → 'off' mapping).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  |  4 +++
 packages/db/src/etl/benchmark-mapper.ts       | 27 ++++++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 15bb60f0..55a206ce 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -295,6 +295,10 @@ const ScatterGraph = React.memo(
     const buildPointConfigId = useCallback((point: InferenceData): string => {
       let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`;
       if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`;
+      // Agentic runs emit two rows per (config, conc) — one offload=on, one off.
+      // Without this suffix, d3's data join treats them as the same point and
+      // drops one variant (along with its halo).
+      if (point.offload_mode) key += `|offload-${point.offload_mode}`;
       return key;
     }, []);
 
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 5b120843..d842276e 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -138,12 +138,24 @@ export function mapBenchmarkRow(
 
   const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
   const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
-  const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc);
+  // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones.
+  const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc);
   if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
 
+  // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
+  // ('none' → 'off'; any other non-empty value → 'on').
+  const offloadModeRaw =
+    typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+      ? row.offload_mode
+      : typeof row.offloading === 'string' && row.offloading.length > 0
+        ? row.offloading === 'none'
+          ? 'off'
+          : 'on'
+        : 'off';
+
   const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
   const isMultinode = parseBool(row.is_multinode);
   const precision = normalizePrecision(String(row.precision ?? ''));
@@ -204,10 +216,10 @@ export function mapBenchmarkRow(
     }
   }
 
-  // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it
-  // as a stringified metric so the frontend can expose it in tooltips.
-  if (isAgentic && typeof row.offload_mode === 'string') {
-    (metrics as Record<string, unknown>).offload_mode = row.offload_mode;
+  // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
+  // — preserve as a stringified metric so the frontend can expose it in tooltips.
+  if (isAgentic) {
+    (metrics as Record<string, unknown>).offload_mode = offloadModeRaw;
   }
 
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
@@ -237,10 +249,7 @@ export function mapBenchmarkRow(
     isl,
     osl,
     conc,
-    offloadMode:
-      typeof row.offload_mode === 'string' && row.offload_mode.length > 0
-        ? row.offload_mode
-        : 'off',
+    offloadMode: offloadModeRaw,
     image,
     metrics,
   };

From 07ba10636dae87b5a819afa524d7c10322fae41b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 00:29:55 -0500
Subject: [PATCH 03/55] fix: render offload halo on every offload-on point, not
 just frontier

The halo's purpose is to surface KV-offload usage; restricting it to
Pareto-frontier-only points hid the indicator on most runs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/ScatterGraph.tsx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 55a206ce..61ac0983 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -1516,10 +1516,9 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
-        // Offload halo: dashed ring on frontier points that used KV offload
+        // Offload halo: dashed ring on every point that used KV offload (Pareto or not)
         zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
-          const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`);
-          const showHalo = onFrontier && d.offload_mode === 'on';
+          const showHalo = d.offload_mode === 'on';
           d3.select(this)
             .selectAll<SVGCircleElement, boolean>('.offload-halo')
             .data(showHalo ? [true] : [])

From 95e9dc77431adf5354ef0df36989816199624383 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 01:13:42 -0500
Subject: [PATCH 04/55] fix: strip runner-pool suffix (-p1, -p2, ...) from hw
 identifier

Artifacts from pooled runners (b300-p1 and similar) were being skipped at
ingest: the runner-pool suffix wasn't in the strip list, so the hw identifier
never normalized to the canonical b300 GPU key.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/normalizers.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index ad12a454..bd497f7a 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null {
     .replace(/-dgxc-slurm$/, '')
     .replace(/-dgxc$/, '')
     .replace(/-nb$/, '')
-    .replace(/-nv$/, '');
+    .replace(/-nv$/, '')
+    .replace(/-p\d+$/, ''); // strip runner-pool suffix (e.g. b300-p1 → b300)
   return GPU_KEYS.has(base) ? base : null;
 }
 

From 982106da5f4421983841304f0503b6467033852d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:25:33 -0500
Subject: [PATCH 05/55] feat: bold scatter labels with concurrency tag +
 collision avoidance

- Label text now includes `C=<conc>` alongside the GPU/parallelism tag
  (default `<tp> C=<conc>`, advanced `<getPointLabel> C=<conc>`)
- Bumped point-label font-weight to 700 so the labels read clearly against
  the chart fill
- Greedy collision-avoidance pass on render and zoom: tries placing each
  label above/below the point through 4 candidate dy offsets, hiding the
  label only when no slot is free

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 68 ++++++++++++++++++-
 .../src/lib/d3-chart/layers/scatter-points.ts |  1 +
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 61ac0983..3fbd8588 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -55,6 +55,63 @@ import {
   buildGradientColorMap,
 } from '@/components/inference/utils/paretoLabels';
 
+// Greedy label-collision avoidance: try positions above/below the point;
+// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom.
+function avoidLabelCollisions(
+  zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
+): void {
+  const labels: {
+    el: SVGTextElement;
+    cx: number;
+    cy: number;
+    w: number;
+    h: number;
+  }[] = [];
+  zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
+    const labelEl = this.querySelector<SVGTextElement>('.point-label');
+    if (!labelEl) return;
+    if ((this as SVGGElement).style.opacity === '0') return;
+    const transform = (this as SVGGElement).getAttribute('transform') ?? '';
+    const m = transform.match(/translate\(([^,]+),([^)]+)\)/);
+    if (!m) return;
+    const cx = parseFloat(m[1]);
+    const cy = parseFloat(m[2]);
+    labelEl.setAttribute('dy', '-8');
+    labelEl.style.opacity = '1';
+    const bbox = labelEl.getBBox();
+    labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height });
+  });
+  labels.sort((a, b) => a.cx - b.cx);
+  const placed: { left: number; right: number; top: number; bottom: number }[] = [];
+  const pad = 1;
+  const candidates = [-8, 14, -22, 28];
+  for (const lab of labels) {
+    let chosenDy: number | null = null;
+    let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
+    for (const dy of candidates) {
+      const top = lab.cy + dy - lab.h - pad;
+      const bottom = lab.cy + dy + pad;
+      const left = lab.cx - lab.w / 2 - pad;
+      const right = lab.cx + lab.w / 2 + pad;
+      const collides = placed.some(
+        (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
+      );
+      if (!collides) {
+        chosenDy = dy;
+        chosenBox = { left, right, top, bottom };
+        break;
+      }
+    }
+    if (chosenDy !== null && chosenBox) {
+      lab.el.setAttribute('dy', String(chosenDy));
+      lab.el.style.opacity = '1';
+      placed.push(chosenBox);
+    } else {
+      lab.el.style.opacity = '0';
+    }
+  }
+}
+
 // X-shape path for overlay (unofficial) data points
 const X_SIZE = 5;
 const X_HOVER_SIZE = 7;
@@ -603,6 +660,7 @@ const ScatterGraph = React.memo(
               d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any,
             );
           }
+          avoidLabelCollisions(ctx.layout.zoomGroup);
         },
       }),
       [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type],
@@ -1251,7 +1309,8 @@ const ScatterGraph = React.memo(
           getOpacity: (d) => (isPointVisible(d) ? 1 : 0),
           getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: hidePointLabels || showGradientLabels,
-          getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+          getLabelText: (d) =>
+            useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1353,8 +1412,11 @@ const ScatterGraph = React.memo(
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
+                  .attr('font-weight', '700')
                   .attr('pointer-events', 'none')
-                  .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp));
+                  .text(
+                    useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
+                  );
               });
 
               // Overlay tooltip handlers
@@ -1566,6 +1628,8 @@ const ScatterGraph = React.memo(
             });
           });
 
+        avoidLabelCollisions(zoomGroup);
+
         // Log tick formatting on initial render
         if (xScaleConfig._isLog) {
           const xScale = ctx.xScale as d3.ScaleLogarithmic<number, number>;
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 507654e1..9f2d2f38 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -72,6 +72,7 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
       .attr('text-anchor', 'middle')
       .attr('fill', config.foreground)
       .attr('font-size', '10px')
+      .attr('font-weight', '700')
       .attr('pointer-events', 'none')
       .text(config.getLabelText);
   }

From 9572b95e86de7cece1179b5f48dd29135765002b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:32:44 -0500
Subject: [PATCH 06/55] fix: stack multi-line point labels upward so they don't
 overlap the point
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tspans now ride above the text's `dy` anchor — the LAST line sits at the
anchor (just above the point) and earlier lines stack above it. Previously
the second tspan landed below the anchor and crashed into the marker.

Also widened collision candidates by label height so the flipped-below
position fully clears the point on multi-line labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 28 +++++++---
 .../src/lib/d3-chart/layers/scatter-points.ts | 52 +++++++++++++------
 2 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 3fbd8588..f8ce9b8f 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -84,8 +84,11 @@ function avoidLabelCollisions(
   labels.sort((a, b) => a.cx - b.cx);
   const placed: { left: number; right: number; top: number; bottom: number }[] = [];
   const pad = 1;
-  const candidates = [-8, 14, -22, 28];
   for (const lab of labels) {
+    // Candidates scale with the label's own height so multi-line labels don't
+    // overlap the point shape when flipped below.
+    const below = lab.h + 8;
+    const candidates = [-8, below, -8 - below - 4, 2 * below];
     let chosenDy: number | null = null;
     let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
     for (const dy of candidates) {
@@ -1310,7 +1313,7 @@ const ScatterGraph = React.memo(
           getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: hidePointLabels || showGradientLabels,
           getLabelText: (d) =>
-            useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
+            useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1403,7 +1406,14 @@ const ScatterGraph = React.memo(
               // Labels
               const showLabels = !hidePointLabels && !showGradientLabels;
               overlayPoints.each(function (d) {
-                d3.select(this)
+                const lines = showLabels
+                  ? (useAdvancedLabels
+                      ? `${getPointLabel(d)}\nC=${d.conc}`
+                      : `${d.tp}\nC=${d.conc}`
+                    ).split('\n')
+                  : [];
+                const text = d3
+                  .select(this)
                   .selectAll<SVGTextElement, boolean>('.overlay-label')
                   .data(showLabels ? [true] : [])
                   .join('text')
@@ -1413,10 +1423,14 @@ const ScatterGraph = React.memo(
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
                   .attr('font-weight', '700')
-                  .attr('pointer-events', 'none')
-                  .text(
-                    useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
-                  );
+                  .attr('pointer-events', 'none');
+                text
+                  .selectAll<SVGTSpanElement, string>('tspan')
+                  .data(lines)
+                  .join('tspan')
+                  .attr('x', 0)
+                  .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+                  .text((l) => l);
               });
 
               // Overlay tooltip handlers
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 9f2d2f38..13c588d8 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -63,18 +63,30 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
     applyNormalState(shape, d.precision);
   });
 
-  // Label (enter only)
+  // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
+  // we stack tspans UPWARD from the text's `dy` anchor so the LAST line sits
+  // at `dy` (just above the point) and earlier lines land above it. That way,
+  // the collision-avoidance pass only has to move the `<text>` element — the
+  // intra-stack offsets stay correct whether the label ends up above or below.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
-    entered
-      .append('text')
-      .attr('class', 'point-label')
-      .attr('dy', -8)
-      .attr('text-anchor', 'middle')
-      .attr('fill', config.foreground)
-      .attr('font-size', '10px')
-      .attr('font-weight', '700')
-      .attr('pointer-events', 'none')
-      .text(config.getLabelText);
+    const labelGetter = config.getLabelText;
+    entered.each(function (d) {
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .append('text')
+        .attr('class', 'point-label')
+        .attr('dy', -8)
+        .attr('text-anchor', 'middle')
+        .attr('fill', config.foreground!)
+        .attr('font-size', '10px')
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      lines.forEach((line, i) => {
+        const tspanDy = i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em';
+        text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line);
+      });
+    });
   }
 
   // Exit: remove stale points
@@ -103,9 +115,12 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
 
   // Update labels: use data join so labels are created/removed properly on toggle
   if (!config.hideLabels && config.getLabelText && config.foreground) {
+    const labelGetter = config.getLabelText;
     points.each(function (d) {
-      const g = d3.select(this);
-      g.selectAll<SVGTextElement, boolean>('.point-label')
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .selectAll<SVGTextElement, boolean>('.point-label')
         .data([true])
         .join('text')
         .attr('class', 'point-label')
@@ -113,8 +128,15 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
-        .attr('pointer-events', 'none')
-        .text(config.getLabelText!(d));
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      text
+        .selectAll<SVGTSpanElement, string>('tspan')
+        .data(lines)
+        .join('tspan')
+        .attr('x', 0)
+        .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+        .text((l) => l);
     });
   } else {
     points.selectAll('.point-label').remove();

From 37eecc6e28c10751ffc52c8a0d0588177e43d4d8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:38:39 -0500
Subject: [PATCH 07/55] fix: anchor multi-line labels via first tspan +
 tspan-aware collision pass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a `<text>` contains tspans, the parent's `dy` does not shift the bbox
cleanly — its (unused) y=0 origin still factors in, so the rendered text
ended up centered on the point. Move the absolute offset into the FIRST
tspan's `dy`; later tspans cascade by 1.1em.

Collision avoidance now drives the first tspan's `dy` and tries four
candidate baselines (primary above, primary below, secondary above,
secondary below), accounting for full label height when picking a
non-overlapping slot. Labels are still hidden as a last resort.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 72 +++++++++++++------
 .../src/lib/d3-chart/layers/scatter-points.ts | 25 ++++---
 2 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index f8ce9b8f..27d3680c 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -55,58 +55,88 @@ import {
   buildGradientColorMap,
 } from '@/components/inference/utils/paretoLabels';
 
-// Greedy label-collision avoidance: try positions above/below the point;
-// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom.
+// Greedy label-collision avoidance.
+// Each candidate is the y-position of the FIRST baseline (relative to point
+// center) which we apply via the first tspan's `dy` — later tspans cascade
+// down by 1.1em. We try above/below at primary and secondary offsets, and
+// hide the label if all four positions collide.
 function avoidLabelCollisions(
   zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
 ): void {
-  const labels: {
+  interface LabelInfo {
     el: SVGTextElement;
+    firstTspan: SVGTSpanElement;
     cx: number;
     cy: number;
     w: number;
-    h: number;
-  }[] = [];
+    nLines: number;
+    defaultFirstY: number;
+  }
+  const labels: LabelInfo[] = [];
+  const ASCENT = 9;
+  const DESCENT = 3;
+  const LINE_H = 11;
+
   zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
     const labelEl = this.querySelector<SVGTextElement>('.point-label');
     if (!labelEl) return;
     if ((this as SVGGElement).style.opacity === '0') return;
+    const tspans = labelEl.querySelectorAll<SVGTSpanElement>('tspan');
+    if (tspans.length === 0) return;
     const transform = (this as SVGGElement).getAttribute('transform') ?? '';
     const m = transform.match(/translate\(([^,]+),([^)]+)\)/);
     if (!m) return;
     const cx = parseFloat(m[1]);
     const cy = parseFloat(m[2]);
-    labelEl.setAttribute('dy', '-8');
+    const nLines = tspans.length;
+    const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point
+    // Reset to default before measuring so prior positioning doesn't bias bbox
+    tspans[0].setAttribute('dy', `${defaultFirstY}px`);
     labelEl.style.opacity = '1';
     const bbox = labelEl.getBBox();
-    labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height });
+    labels.push({
+      el: labelEl,
+      firstTspan: tspans[0],
+      cx,
+      cy,
+      w: bbox.width,
+      nLines,
+      defaultFirstY,
+    });
   });
+
   labels.sort((a, b) => a.cx - b.cx);
   const placed: { left: number; right: number; top: number; bottom: number }[] = [];
-  const pad = 1;
+  const pad = 2;
+
   for (const lab of labels) {
-    // Candidates scale with the label's own height so multi-line labels don't
-    // overlap the point shape when flipped below.
-    const below = lab.h + 8;
-    const candidates = [-8, below, -8 - below - 4, 2 * below];
-    let chosenDy: number | null = null;
+    const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT;
+    const aboveFirstY = lab.defaultFirstY;
+    const belowFirstY = 14; // first baseline 14px below point center
+    const candidates = [
+      aboveFirstY,
+      belowFirstY,
+      aboveFirstY - blockH - 2,
+      belowFirstY + blockH + 2,
+    ];
+    let chosenY: number | null = null;
     let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
-    for (const dy of candidates) {
-      const top = lab.cy + dy - lab.h - pad;
-      const bottom = lab.cy + dy + pad;
+    for (const firstY of candidates) {
+      const top = lab.cy + firstY - ASCENT - pad;
+      const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad;
       const left = lab.cx - lab.w / 2 - pad;
       const right = lab.cx + lab.w / 2 + pad;
       const collides = placed.some(
         (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
       );
       if (!collides) {
-        chosenDy = dy;
+        chosenY = firstY;
         chosenBox = { left, right, top, bottom };
         break;
       }
     }
-    if (chosenDy !== null && chosenBox) {
-      lab.el.setAttribute('dy', String(chosenDy));
+    if (chosenY !== null && chosenBox) {
+      lab.firstTspan.setAttribute('dy', `${chosenY}px`);
       lab.el.style.opacity = '1';
       placed.push(chosenBox);
     } else {
@@ -1418,18 +1448,18 @@ const ScatterGraph = React.memo(
                   .data(showLabels ? [true] : [])
                   .join('text')
                   .attr('class', 'overlay-label')
-                  .attr('dy', -10)
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
                   .attr('font-weight', '700')
                   .attr('pointer-events', 'none');
+                const firstDy = -(1 + (lines.length - 1) * 1.1);
                 text
                   .selectAll<SVGTSpanElement, string>('tspan')
                   .data(lines)
                   .join('tspan')
                   .attr('x', 0)
-                  .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+                  .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
                   .text((l) => l);
               });
 
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 13c588d8..71d1f050 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -64,10 +64,10 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   });
 
   // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
-  // we stack tspans UPWARD from the text's `dy` anchor so the LAST line sits
-  // at `dy` (just above the point) and earlier lines land above it. That way,
-  // the collision-avoidance pass only has to move the `<text>` element — the
-  // intra-stack offsets stay correct whether the label ends up above or below.
+  // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't
+  // pick up the text element's own (unused) y=0 origin. The first tspan is
+  // raised so the LAST line baseline lands ~8px above the point; subsequent
+  // tspans cascade down by 1.1em.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
     const labelGetter = config.getLabelText;
     entered.each(function (d) {
@@ -76,15 +76,18 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .select(this)
         .append('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
         .attr('font-weight', '700')
         .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
       lines.forEach((line, i) => {
-        const tspanDy = i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em';
-        text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line);
+        text
+          .append('tspan')
+          .attr('x', 0)
+          .attr('dy', i === 0 ? `${firstDy}em` : '1.1em')
+          .text(line);
       });
     });
   }
@@ -113,7 +116,9 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   // Update colors on existing shapes (handles hw color changes)
   points.select('.visible-shape').attr('fill', config.getColor as any);
 
-  // Update labels: use data join so labels are created/removed properly on toggle
+  // Update labels: use data join so labels are created/removed properly on toggle.
+  // Anchor the stack via the first tspan (NOT the text dy — that doesn't shift the
+  // bbox cleanly when there are tspan children).
   if (!config.hideLabels && config.getLabelText && config.foreground) {
     const labelGetter = config.getLabelText;
     points.each(function (d) {
@@ -124,18 +129,18 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .data([true])
         .join('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
         .attr('font-weight', '700')
         .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
       text
         .selectAll<SVGTSpanElement, string>('tspan')
         .data(lines)
         .join('tspan')
         .attr('x', 0)
-        .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+        .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
         .text((l) => l);
     });
   } else {

From f317377dfaea35f9cb5dc435ea177966aa17fbf8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 10:21:00 -0500
Subject: [PATCH 08/55] fix: dedupe artifacts by logical name + skip
 0-successful agg rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two complementary fixes for runs whose `results_bmk` aggregated artifact
ends up containing both a successful row and a failed-attempt row for the
same (config, conc, offload) — the failed row's null metrics were
overwriting the good row via ON CONFLICT DO UPDATE.

1. Artifact-level: strip the trailing `_<runner-pool>_<attempt>` suffix
   from each artifact name and group by the logical name, keeping only the
   most recent per group.

2. Row-level: skip rows with `num_requests_successful === 0` AND
   `num_requests_total > 0`. The aggregated artifact merges rows from all
   runners — including failed ones — so artifact-level dedup alone can't
   reach inside it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/benchmark-mapper.ts | 14 +++++++++++
 packages/db/src/etl/skip-tracker.ts     | 10 +++++++-
 packages/db/src/ingest-ci-run.ts        | 33 ++++++++++++++++++++-----
 packages/db/src/ingest-gcs-backup.ts    |  1 +
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index d842276e..1aff5ea9 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -145,6 +145,20 @@ export function mapBenchmarkRow(
     return null;
   }
 
+  // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
+  // every runner, including ones with 0 successful requests and null metrics.
+  // Without this skip, the empty row's nulls overwrite a good row via
+  // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
+  if (
+    typeof row.num_requests_successful === 'number' &&
+    row.num_requests_successful === 0 &&
+    typeof row.num_requests_total === 'number' &&
+    row.num_requests_total > 0
+  ) {
+    tracker.skips.failedRun++;
+    return null;
+  }
+
   // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
   // ('none' → 'off'; any other non-empty value → 'on').
   const offloadModeRaw =
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 6166ea44..588718dd 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,6 +8,7 @@ export interface Skips {
   unmappedModel: number;
   unmappedHw: number;
   noIslOsl: number;
+  failedRun: number;
   dbError: number;
 }
 
@@ -66,7 +67,14 @@ const MAX_DB_ERRORS = 10;
  * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
  */
 export function createSkipTracker(): SkipTracker {
-  const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 };
+  const skips: Skips = {
+    badZip: 0,
+    unmappedModel: 0,
+    unmappedHw: 0,
+    noIslOsl: 0,
+    failedRun: 0,
+    dbError: 0,
+  };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
   const unmappedPrecisions = new Set<string>();
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 8cce43ca..fb1fbbbc 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -101,15 +101,30 @@ if (isDownloadMode) {
     } catch {}
   }
 
-  const byName = new Map<string, (typeof allArtifacts)[0]>();
+  // Strip the trailing `_<runner-pool>_<attempt-digits>` token from each
+  // artifact name, then group by the resulting logical name and keep only
+  // the most recent per group. Without this, two artifacts produced on
+  // different runners for the same logical config (e.g. `…_h200-cw_00` and
+  // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty
+  // metrics can overwrite the good one via ON CONFLICT DO UPDATE.
+  //
+  // The runner pool name itself has no underscores (`h200-cw`,
+  // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip
+  // bounded — using `\w` here would over-match across earlier `_`
+  // separators and collapse different (conc, offload) variants into the
+  // same logical name.
+  const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/;
+  const byLogical = new Map<string, (typeof allArtifacts)[0]>();
   for (const a of allArtifacts) {
-    const existing = byName.get(a.name);
+    const key = a.name.replace(RUNNER_SUFFIX_RE, '');
+    const existing = byLogical.get(key);
     if (!existing || a.created_at > existing.created_at) {
-      byName.set(a.name, a);
+      byLogical.set(key, a);
     }
   }
 
-  for (const [name, artifact] of byName) {
+  for (const [, artifact] of byLogical) {
+    const name = artifact.name;
     console.log(`  ${name}`);
     const zipPath = path.join(artifactsDir, 'artifact.zip');
     execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
@@ -121,7 +136,7 @@ if (isDownloadMode) {
     fs.unlinkSync(zipPath);
   }
 
-  console.log(`\n  Downloaded ${byName.size} artifact(s)`);
+  console.log(`\n  Downloaded ${byLogical.size} artifact(s)`);
 
   // Fetch run attempt from API
   const attemptStr = execSync(
@@ -510,11 +525,17 @@ async function main(): Promise<void> {
 
   const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker;
   const totalSkips =
-    skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError;
+    skips.badZip +
+    skips.unmappedModel +
+    skips.unmappedHw +
+    skips.noIslOsl +
+    skips.failedRun +
+    skips.dbError;
   if (totalSkips > 0) {
     console.log(`\n  Skipped: ${totalSkips} rows`);
     const skipLines: [string, number][] = [
       ['no isl/osl (old format)', skips.noIslOsl],
+      ['failed run (0 successful)', skips.failedRun],
       ['unmapped model', skips.unmappedModel],
       ['unmapped hw', skips.unmappedHw],
       ['bad/empty zip', skips.badZip],
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 6dc604e9..d67f5164 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -434,6 +434,7 @@ async function mapWorkflowDir(
       unmappedModel: local.skips.unmappedModel,
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
+      failedRun: local.skips.failedRun,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),

From c2f66f62f5a1dedb6a87c7c5e58ca990b3cb0956 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 7 May 2026 08:41:26 -0500
Subject: [PATCH 09/55] feat: add AIPerf to FRAMEWORK_LABELS

Tag display name for the `aiperf` spec_method suffix used by the
alternate-harness runs ingested for the agentic minimax sweep.
Without this entry the legend shows 'AIPERF' from the default
toUpperCase fallback.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/constants/src/framework-aliases.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts
index cc5eb6b4..e23a93bc 100644
--- a/packages/constants/src/framework-aliases.ts
+++ b/packages/constants/src/framework-aliases.ts
@@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record<string, string> = {
     ]),
   ),
   mtp: 'MTP',
+  aiperf: 'AIPerf',
 };
 
 /**

From 024797a978a2a6e2954f66a963de3205b62a149e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 12 May 2026 15:02:07 -0500
Subject: [PATCH 10/55] fix(changelog): coerce ids to string when filtering
 changelog by run

bigint workflow_run_id sometimes deserializes as a number on the
frontend depending on the postgres adapter's behavior; strict ===
between a number and a string silently dropped every match, so the
changelog popover always reported "no changelog data available."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/GlobalFilterContext.tsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 08fc7094..11e56de7 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -87,7 +87,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record<string, RunInfo> {
   const runs: Record<string, RunInfo> = {};
   for (const run of data.runs) {
     const runId = String(run.github_run_id);
-    const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id);
+    const runChangelogs = data.changelogs.filter(
+      (c) => String(c.workflow_run_id) === String(run.github_run_id),
+    );
     runs[runId] = {
       runId,
       runDate: run.created_at,

From aa154193dfbc12535f25444cdf6fccc16a3e1382 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 12 May 2026 15:36:57 -0500
Subject: [PATCH 11/55] feat: default sequence to Agentic Traces when available

Default the initial sequence selection to Agentic Traces (instead of the
8K/1K fixed-seq) whenever the user hasn't explicitly chosen one via URL,
so models that have agentic_traces data show it first.
effectiveSequence already falls back to availableSequences[0] for models
without agentic, so models with only fixed-seq data still render correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/GlobalFilterContext.tsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 11e56de7..7813d079 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -125,7 +125,9 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   const [selectedSequence, setSelectedSequence] = useState<Sequence>(() => {
     const urlSeq = getUrlParam('i_seq');
     if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence;
-    return Sequence.EightK_OneK;
+    // Prefer Agentic Traces by default when the selected model has it; the
+    // effectiveSequence fallback below handles models without agentic data.
+    return Sequence.AgenticTraces;
   });
 
   const [selectedPrecisions, setSelectedPrecisionsRaw] = useState<string[]>(() => {

From 099a33efcb53f5130dc40d715a0f4b86d6136a93 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:25:25 -0500
Subject: [PATCH 12/55] fix(agentic): respect percentile selector for
 input-throughput x axis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rowToAggDataEntry was only copying median/p99 metric variants — picking
p90/p99.9 in the percentile selector silently fell back to 0 and
collapsed every point into a vertical line at x=0. Copy the full
median/p90/p99/p99.9 set into AggDataEntry.

Hide the X-Axis Metric dropdown for agentic mode (it doubled up with the
percentile selector) and route the input-metric chart through
withPercentile so the selected percentile is what actually gets plotted
(e.g. median -> median_ttft) instead of always using the hard-coded
p99_ttft config default. Percentile options pared back to median + p99.
---
 .../inference/hooks/useChartData.ts           | 46 +++++++++++++++++--
 .../app/src/components/inference/types.ts     | 10 ++++
 .../components/inference/ui/ChartControls.tsx |  3 +-
 packages/app/src/lib/benchmark-transform.ts   | 12 ++++-
 packages/app/src/lib/data-mappings.ts         |  8 +---
 packages/app/src/lib/energy-metrics.test.ts   | 10 ++++
 6 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 81ab0780..57e9a1c2 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -16,7 +16,7 @@ import { filterDataByCostLimit } from '@/components/inference/utils';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants';
 import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 
 /** Build deduplicated comparison dates, excluding the main run date. */
@@ -216,7 +216,14 @@ export function useChartData(
             ? 'P99 Time To First Token (s)'
             : 'Median Time To First Token (s)';
 
-        if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
+
+        if (
+          effectiveXMetric &&
+          chartDef.chartType === 'interactivity' &&
+          isInputMetric &&
+          !isAgentic
+        ) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) {
@@ -225,15 +232,40 @@ export function useChartData(
             xAxisLabel = isTtftOverride ? ttftLabel : chartDef.x_label;
           }
         } else if (chartDef.chartType === 'interactivity' && isInputMetric) {
+          // Agentic falls through here too — the manual X-axis dropdown is
+          // hidden in agentic mode (would double up with the percentile
+          // selector), so the config default + percentile post-processing
+          // below drives the x axis.
           const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition;
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
           xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label;
-        } else if (chartDef.chartType === 'e2e' && isTtftOverride) {
+        } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           xAxisLabel = ttftLabel;
         }
 
+        // Agentic: rewrite the resolved x metric to the chosen percentile,
+        // and relabel accordingly. naturalX is already percentile-adjusted,
+        // so the per-metric override path is the only one that actually
+        // changes here.
+        if (isAgentic) {
+          const adjusted = withPercentile(
+            xAxisField as string,
+            selectedPercentile,
+          ) as keyof AggDataEntry;
+          if (adjusted !== xAxisField) {
+            const pctlWord =
+              selectedPercentile === 'median'
+                ? 'Median'
+                : selectedPercentile === 'p99.9'
+                  ? 'P99.9'
+                  : selectedPercentile.toUpperCase();
+            xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+            xAxisField = adjusted;
+          }
+        }
+
         // The x-axis is "flipped" only when the good-direction reverses
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
@@ -269,7 +301,13 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile],
+    [
+      selectedYAxisMetric,
+      selectedXAxisMetric,
+      selectedE2eXAxisMetric,
+      selectedPercentile,
+      selectedSequence,
+    ],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index a2d9ef2e..cddeba54 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -50,23 +50,33 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p90_ttft: number;
   p99_ttft: number;
+  'p99.9_ttft': number;
   mean_tpot: number;
   mean_intvty: number;
   median_tpot: number;
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p90_tpot: number;
+  p90_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
+  'p99.9_tpot': number;
+  'p99.9_intvty': number;
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p90_itl: number;
   p99_itl: number;
+  'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p90_e2el: number;
   p99_e2el: number;
+  'p99.9_e2el': number;
   disagg: boolean;
   num_prefill_gpu: number;
   num_decode_gpu: number;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 6707bd9e..7b4fa08f 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -269,7 +269,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
           </div>
 
           {graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') &&
-            isInputMetric && (
+            isInputMetric &&
+            selectedSequence !== Sequence.AgenticTraces && (
               <div className="flex flex-col space-y-1.5 lg:col-span-1">
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 69745da2..eb62a18a 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -25,7 +25,7 @@ import type { BenchmarkRow } from '@/lib/api';
  */
 function agenticAliases(m: Record<string, number>): Record<string, number> {
   const out: Record<string, number> = {};
-  for (const suffix of ['mean', 'median', 'p90', 'p99']) {
+  for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
     const itl = m[`${suffix}_itl`];
     const ttlt = m[`${suffix}_ttlt`];
     if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
@@ -62,23 +62,33 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p90_ttft: m.p90_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
+    'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p90_tpot: m.p90_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
+    'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p90_intvty: m.p90_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
+    'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p90_itl: m.p90_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
+    'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p90_e2el: m.p90_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
+    'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     disagg: row.disagg,
     num_prefill_gpu: row.num_prefill_gpu,
     num_decode_gpu: row.num_decode_gpu,
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index f137875c..bf48c864 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,21 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which
- * slice to plot.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the
+ * two most commonly read slices (p50, p99) are surfaced in the UI.
  */
 export enum Percentile {
   Median = 'median',
-  P90 = 'p90',
   P99 = 'p99',
-  P99_9 = 'p99.9',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.Median]: { label: 'p50 (median)' },
-  [Percentile.P90]: { label: 'p90' },
   [Percentile.P99]: { label: 'p99' },
-  [Percentile.P99_9]: { label: 'p99.9' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 28cc1e36..54788585 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,23 +57,33 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p90_ttft: 0.7,
     p99_ttft: 0.8,
+    'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
     mean_intvty: 45,
     median_tpot: 0.02,
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p90_tpot: 0.025,
+    p90_intvty: 55,
     p99_tpot: 0.03,
     p99_intvty: 60,
+    'p99.9_tpot': 0.035,
+    'p99.9_intvty': 65,
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p90_itl: 0.013,
     p99_itl: 0.015,
+    'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p90_e2el: 5.5,
     p99_e2el: 6,
+    'p99.9_e2el': 6.5,
     disagg: false,
     num_prefill_gpu: 0,
     num_decode_gpu: 0,

From 50a06d1419c70ddd8d24b2c6545da44fe6be3a4d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:27:19 -0500
Subject: [PATCH 13/55] fix(agentic): default percentile to p99 and drop median
 option

---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/data-mappings.ts                      | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index b4ccb9ef..af2d364e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -122,7 +122,7 @@ export function InferenceProvider({
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
-    () => getUrlParam('i_pctl') || 'median',
+    () => getUrlParam('i_pctl') || 'p99',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index bf48c864..1b4f47c3 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,16 +186,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the
- * two most commonly read slices (p50, p99) are surfaced in the UI.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p99
+ * is surfaced in the UI.
  */
 export enum Percentile {
-  Median = 'median',
   P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
-  [Percentile.Median]: { label: 'p50 (median)' },
   [Percentile.P99]: { label: 'p99' },
 };
 

From 3c96e9137776d1c368a0acdfeee6e769d5733464 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:31:27 -0500
Subject: [PATCH 14/55] fix(agentic): keep only p90 as the percentile option

---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/data-mappings.ts                      | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 0ba14a21..accfdf9e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -136,7 +136,7 @@ export function InferenceProvider({
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
-    () => getUrlParam('i_pctl') || 'p99',
+    () => getUrlParam('i_pctl') || 'p90',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 0afb304a..83e6648a 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -191,12 +191,10 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
  */
 export enum Percentile {
   P90 = 'p90',
-  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
-  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];

From 642081af77c8165ac89a5177abbd6c0244dfb9c0 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:31:30 -0400
Subject: [PATCH 15/55] fix(agentic): default percentile to p90, surface only
 p90/p99

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts                | 2 +-
 .../app/src/components/inference/InferenceContext.tsx    | 2 +-
 .../app/src/components/inference/hooks/useChartData.ts   | 9 ++-------
 packages/app/src/components/ui/chart-selectors.tsx       | 2 +-
 packages/app/src/lib/data-mappings.ts                    | 6 ++++--
 packages/app/src/lib/url-state.ts                        | 2 +-
 6 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index f267dcc9..34b89aba 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,7 +189,7 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
-    selectedPercentile: 'median',
+    selectedPercentile: 'p90',
     setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index accfdf9e..36dc672d 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -134,7 +134,7 @@ export function InferenceProvider({
     () => getUrlParam('i_e2e_xmetric') || null,
   );
   // Latency percentile applied to the chart x-axis for agentic scenarios.
-  // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
+  // Values: 'p90' | 'p99'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
     () => getUrlParam('i_pctl') || 'p90',
   );
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index f2ef85ec..436fd662 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -83,7 +83,7 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
-  selectedPercentile = 'median',
+  selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
 ) {
@@ -261,12 +261,7 @@ export function useChartData(
             selectedPercentile,
           ) as keyof AggDataEntry;
           if (adjusted !== xAxisField) {
-            const pctlWord =
-              selectedPercentile === 'median'
-                ? 'Median'
-                : selectedPercentile === 'p99.9'
-                  ? 'P99.9'
-                  : selectedPercentile.toUpperCase();
+            const pctlWord = selectedPercentile.toUpperCase();
             xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
             xAxisField = adjusted;
           }
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index d2940de4..e30816fa 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -315,7 +315,7 @@ export function PercentileSelector({
       <LabelWithTooltip
         htmlFor={id}
         label="Latency Percentile"
-        tooltip="Percentile of the latency distribution used for the chart x-axis. Agentic runs carry median/p90/p99/p99.9 variants; switch percentiles to see tail-latency behavior."
+        tooltip="Percentile of the latency distribution used for the chart x-axis. Switch between p90 and p99 to see tail-latency behavior on agentic runs."
       />
       <Select
         value={value}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 83e6648a..0970f8d7 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,15 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); p90 and
- * p99 are surfaced in the UI.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
+ * and p99 are surfaced in the UI.
  */
 export enum Percentile {
   P90 = 'p90',
+  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
+  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 35ac2359..54ce43d9 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -67,7 +67,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_seq: '8k/1k',
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
-  i_pctl: 'median',
+  i_pctl: 'p90',
   i_xmetric: 'p99_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',

From 3f45f4df92e1990070bf5a58dd7753aa9a91baff Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:38:23 -0400
Subject: [PATCH 16/55] fix(agentic): drop p99 + median TTFT, p90 only across
 selectors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aligns the TTFT x-axis selectors with the percentile selector — only
p90 is offered everywhere. The default x-axis metric and the chart
config's input-throughput x field are now both p90_ttft.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |  2 +-
 .../inference/hooks/useChartData.ts           | 10 +---
 .../inference/inference-chart-config.json     | 10 ++--
 .../inference/replay/buildReplayTimeline.ts   |  3 +-
 .../components/inference/ui/ChartControls.tsx |  7 +--
 .../components/inference/ui/ChartDisplay.tsx  | 19 ++-----
 .../src/components/inference/utils.test.ts    | 57 +++++++------------
 .../app/src/components/inference/utils.ts     |  3 +-
 .../app/src/components/ui/chart-selectors.tsx |  2 +-
 packages/app/src/lib/data-mappings.ts         |  4 +-
 packages/app/src/lib/url-state.ts             |  2 +-
 11 files changed, 42 insertions(+), 77 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 36dc672d..e88f57d8 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -128,7 +128,7 @@ export function InferenceProvider({
     () => getUrlParam('i_metric') || 'y_tpPerGpu',
   );
   const [selectedXAxisMetric, setSelectedXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_xmetric') || 'p99_ttft',
+    () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || null,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 436fd662..69222859 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -215,12 +215,8 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
-        const isTtftOverride =
-          effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft';
-        const ttftLabel =
-          effectiveXMetric === 'p99_ttft'
-            ? 'P99 Time To First Token (s)'
-            : 'Median Time To First Token (s)';
+        const isTtftOverride = effectiveXMetric === 'p90_ttft';
+        const ttftLabel = 'P90 Time To First Token (s)';
 
         const isAgentic = selectedSequence === Sequence.AgenticTraces;
 
@@ -340,7 +336,7 @@ export function useChartData(
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft';
+        const isTtftX = xAxisField === 'p90_ttft';
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json
index e26d237e..dcd91e60 100644
--- a/packages/app/src/components/inference/inference-chart-config.json
+++ b/packages/app/src/components/inference/inference-chart-config.json
@@ -13,9 +13,9 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_left",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
-    "y_inputTputPerGpu_heading": "vs. P99 Time To First Token",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
+    "y_inputTputPerGpu_heading": "vs. P90 Time To First Token",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
@@ -105,8 +105,8 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_right",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
index be076418..b0eb1446 100644
--- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts
+++ b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
@@ -82,8 +82,7 @@ function resolveXAxisField(
   const metricTitle =
     (chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || '';
   const isInputMetric = metricTitle.toLowerCase().includes('input');
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     return selectedXAxisMetric;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 7b4fa08f..ad222edc 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -275,11 +275,11 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
                   label="X-Axis Metric"
-                  tooltip="The latency metric displayed on the chart's X-axis. Options include P99 Time To First Token and Median Time To First Token."
+                  tooltip="The latency metric displayed on the chart's X-axis: P90 Time To First Token."
                 />
                 <Select
                   onValueChange={handleXAxisMetricChange}
-                  value={selectedXAxisMetric ?? 'p99_ttft'}
+                  value={selectedXAxisMetric ?? 'p90_ttft'}
                 >
                   <SelectTrigger
                     id="x-axis-select"
@@ -289,8 +289,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                     <SelectValue />
                   </SelectTrigger>
                   <SelectContent portalled={false}>
-                    <SelectItem value="p99_ttft">P99 TTFT</SelectItem>
-                    <SelectItem value="median_ttft">Median TTFT</SelectItem>
+                    <SelectItem value="p90_ttft">P90 TTFT</SelectItem>
                   </SelectContent>
                 </Select>
               </div>
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index f0e1692a..78df2c37 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -408,27 +408,20 @@ export default function ChartDisplay() {
                             if (
                               graph.chartDefinition.chartType === 'interactivity' &&
                               isInputMetric &&
-                              selectedXAxisMetric
+                              selectedXAxisMetric === 'p90_ttft'
                             ) {
-                              if (selectedXAxisMetric === 'p99_ttft') {
-                                return 'vs. P99 Time To First Token';
-                              } else if (selectedXAxisMetric === 'median_ttft') {
-                                return 'vs. Median Time To First Token';
-                              }
+                              return 'vs. P90 Time To First Token';
                             }
 
                             // For e2e chart: render clickable inline dropdown for x-axis
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p99_ttft'
-                                  ? 'P99 TTFT'
-                                  : selectedE2eXAxisMetric === 'median_ttft'
-                                    ? 'Median TTFT'
-                                    : 'End-to-end Latency';
+                                selectedE2eXAxisMetric === 'p90_ttft'
+                                  ? 'P90 TTFT'
+                                  : 'End-to-end Latency';
                               const xAxisOptions = [
                                 { value: null, label: 'End-to-end Latency' },
-                                { value: 'p99_ttft', label: 'P99 TTFT' },
-                                { value: 'median_ttft', label: 'Median TTFT' },
+                                { value: 'p90_ttft', label: 'P90 TTFT' },
                               ];
                               const zoomPrefix =
                                 selectedDateRange.startDate &&
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..589ba580 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -157,12 +157,12 @@ describe('processOverlayChartData', () => {
   });
 
   it('remaps x to config override for input metrics on interactivity chart', () => {
-    // inputTputPerGpu has x override to p99_ttft on interactivity chart
+    // inputTputPerGpu has x override to p90_ttft on interactivity chart
     const data = [
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_intvty: 50,
       } as any),
     ];
@@ -176,16 +176,11 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        median_ttft: 0.1,
+        p90_ttft: 0.1,
         median_intvty: 50,
       } as any),
     ];
-    const result = processOverlayChartData(
-      data,
-      'interactivity',
-      'y_inputTputPerGpu',
-      'median_ttft',
-    );
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.1);
   });
@@ -195,76 +190,62 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_e2el: 2.5,
       } as any),
     ];
     const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
     expect(result).toHaveLength(1);
-    // e2e uses median_e2el as x (from chart config default), not p99_ttft
+    // e2e uses median_e2el as x (from chart config default), not p90_ttft
     expect(result[0].x).toBe(2.5);
   });
 
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
-    const data = [
-      pt({
-        x: 100,
-        tpPerGpu: { y: 42, roof: false },
-        p99_ttft: 0.35,
-        median_e2el: 2.5,
-      } as any),
-    ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
-    expect(result).toHaveLength(1);
-    expect(result[0].x).toBe(0.35);
-  });
-
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
     const data = [
       pt({
         x: 100,
         tpPerGpu: { y: 42, roof: false },
-        median_ttft: 0.12,
+        p90_ttft: 0.12,
         median_e2el: 2.5,
       } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.12);
   });
 
   it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
     const data = [
-      pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
-      pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+      pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+      pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
 
   it('does not filter interactivity points by latency limit when x-axis is default', () => {
-    // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+    // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
     // chart's x-axis stays median_intvty for non-input metrics. The latency limit
     // (60) must NOT apply to median_intvty values.
     const data = [
       pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
       pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(2);
   });
 
   it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
-    // When an input metric IS selected and x-axis overrides to p99_ttft,
+    // When an input metric IS selected and x-axis overrides to p90_ttft,
     // the latency limit should apply.
     const data = [
-      pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
-      pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+      pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+      pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
-    // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+    // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..735007ab 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -88,8 +88,7 @@ export function processOverlayChartData(
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
   // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index e30816fa..19b4bfb0 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -315,7 +315,7 @@ export function PercentileSelector({
       <LabelWithTooltip
         htmlFor={id}
         label="Latency Percentile"
-        tooltip="Percentile of the latency distribution used for the chart x-axis. Switch between p90 and p99 to see tail-latency behavior on agentic runs."
+        tooltip="Percentile of the latency distribution used for the chart x-axis on agentic runs."
       />
       <Select
         value={value}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 0970f8d7..91f65a34 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -187,16 +187,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
  * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
- * and p99 are surfaced in the UI.
+ * is surfaced in the UI.
  */
 export enum Percentile {
   P90 = 'p90',
-  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
-  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 54ce43d9..b88c92b2 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -68,7 +68,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
   i_pctl: 'p90',
-  i_xmetric: 'p99_ttft',
+  i_xmetric: 'p90_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',
   i_gpus: '',

From 03c775ac9710b4a95d2d2c270adfcfe202219130 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:41:14 -0400
Subject: [PATCH 17/55] fix(agentic): honor e2e TTFT override in agentic mode
 too

The `!isAgentic` gate on the e2e TTFT override branch dropped the
user's `p90_ttft` pick in agentic mode, leaving the chart on the
default p90_e2el. The trailing withPercentile pass is idempotent
when xAxisField is already at the right percentile, so the gate is
unnecessary.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/hooks/useChartData.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 69222859..2a344cef 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -242,7 +242,7 @@ export function useChartData(
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
           xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label;
-        } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) {
+        } else if (chartDef.chartType === 'e2e' && isTtftOverride) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           xAxisLabel = ttftLabel;
         }

From 49f2b2780d71cdad7b4a52ae0fdab0e2b8013d09 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:45:19 -0400
Subject: [PATCH 18/55] fix(agentic): default e2e chart x-axis to p90 TTFT

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/url-state.ts                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index e88f57d8..c80afc2e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -131,7 +131,7 @@ export function InferenceProvider({
     () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_e2e_xmetric') || null,
+    () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
   );
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index b88c92b2..4a48a776 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -69,7 +69,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_metric: 'y_tpPerGpu',
   i_pctl: 'p90',
   i_xmetric: 'p90_ttft',
-  i_e2e_xmetric: '',
+  i_e2e_xmetric: 'p90_ttft',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 9e2c5322b0873ecd8ba8720d7e7e21961a7178dd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:47:22 -0500
Subject: [PATCH 19/55] fix(tooltip): cap data-point numeric values at 3
 decimal places

---
 .../inference/utils/tooltipUtils.ts           | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 4359fc44..3154070a 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -91,6 +91,14 @@ const tooltipLine = (label: string, value: string | number) =>
 const formatPct = (v: number | undefined): string | null =>
   v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
 
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */
+const fmt = (v: number): string => {
+  if (!Number.isFinite(v)) return String(v);
+  const rounded = parseFloat(v.toFixed(3));
+  if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
+  return String(rounded);
+};
+
 /**
  * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
  * success, token totals. Returns an empty string for non-agentic rows.
@@ -201,16 +209,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -218,7 +226,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -274,10 +282,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
         <strong>Date:</strong> ${d.actualDate ?? d.date}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${tooltipLine('Total GPUs', d.tp)}
       ${generateParallelismHTML(d)}
@@ -318,16 +326,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -335,7 +343,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }

From 50ed25fa95e36d2ad881a1f68aa70010a19f34de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:05:09 -0500
Subject: [PATCH 20/55] fix(agentic): relabel x-axis title for natural-x case
 too

---
 .../components/inference/hooks/useChartData.ts    | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 2a344cef..b14775b6 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -248,19 +248,16 @@ export function useChartData(
         }
 
         // Agentic: rewrite the resolved x metric to the chosen percentile,
-        // and relabel accordingly. naturalX is already percentile-adjusted,
-        // so the per-metric override path is the only one that actually
-        // changes here.
+        // and relabel accordingly. Both have to be updated unconditionally —
+        // xAxisField may already be percentile-adjusted (via naturalX) while
+        // xAxisLabel still carries the raw chartDef.x_label prefix.
         if (isAgentic) {
-          const adjusted = withPercentile(
+          xAxisField = withPercentile(
             xAxisField as string,
             selectedPercentile,
           ) as keyof AggDataEntry;
-          if (adjusted !== xAxisField) {
-            const pctlWord = selectedPercentile.toUpperCase();
-            xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
-            xAxisField = adjusted;
-          }
+          const pctlWord = selectedPercentile.toUpperCase();
+          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
         }
 
         // The x-axis is "flipped" only when the good-direction reverses

From e9d8e3f66143fcdce8709f4a55bd0f29889d7174 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:08:05 -0500
Subject: [PATCH 21/55] fix(agentic): include percentile word in chart heading

---
 .../app/src/components/inference/hooks/useChartData.ts |  9 +++++++++
 .../app/src/components/inference/ui/ChartDisplay.tsx   | 10 ++++------
 .../components/inference/ui/UnofficialChartDisplay.tsx |  4 +---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index b14775b6..0d13b8ca 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -251,6 +251,10 @@ export function useChartData(
         // and relabel accordingly. Both have to be updated unconditionally —
         // xAxisField may already be percentile-adjusted (via naturalX) while
         // xAxisLabel still carries the raw chartDef.x_label prefix.
+        // The chart heading ("vs. <latency>") is also rewritten to include
+        // the percentile so the title above the plot reflects what's drawn.
+        const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition;
+        let chartHeading = (chartDef[headingKey] as string) || chartDef.heading;
         if (isAgentic) {
           xAxisField = withPercentile(
             xAxisField as string,
@@ -258,6 +262,10 @@ export function useChartData(
           ) as keyof AggDataEntry;
           const pctlWord = selectedPercentile.toUpperCase();
           xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+          chartHeading = chartHeading.replace(
+            /^(vs\.\s+)(?:(Median|Mean|P90|P99(?:\.9)?)\s+)?/iu,
+            `$1${pctlWord} `,
+          );
         }
 
         // The x-axis is "flipped" only when the good-direction reverses
@@ -288,6 +296,7 @@ export function useChartData(
           chartDefinition: {
             ...chartDef,
             ...rooflineOverrides,
+            heading: chartHeading,
             x_label: xAxisLabel,
             y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel),
           },
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 78df2c37..35213a14 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -449,12 +449,10 @@ export default function ChartDisplay() {
                               );
                             }
 
-                            // Fall back to configured heading
-                            return (
-                              graph.chartDefinition[
-                                `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                              ] || graph.chartDefinition.heading
-                            );
+                            // Fall back to the heading baked into chartDefinition
+                            // by useChartData (already resolves per-metric overrides
+                            // and applies the agentic percentile rewrite).
+                            return graph.chartDefinition.heading;
                           })()}
                         </h2>
                         <p className="text-sm text-muted-foreground mb-2">
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index f9b1b3c8..73018483 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
                           `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
                         ]
                       }{' '}
-                      {graph.chartDefinition[
-                        `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                      ] || graph.chartDefinition.heading}
+                      {graph.chartDefinition.heading}
                     </h2>
                     <p className="text-sm text-muted-foreground mb-2">
                       {graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}

From 2046282eb3386bd0e7164b57a3f5dace9465e169 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:15:24 -0500
Subject: [PATCH 22/55] fix(agentic): include percentile in e2e chart heading
 dropdown

---
 .../src/components/inference/ui/ChartDisplay.tsx    | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 35213a14..e9021aed 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -40,6 +40,7 @@ import {
   getModelLabel,
   getPrecisionLabel,
   getSequenceLabel,
+  sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
@@ -152,6 +153,7 @@ export default function ChartDisplay() {
     activeHwTypes,
     activeDates,
     setSelectedE2eXAxisMetric,
+    selectedPercentile,
     compareGpuPair,
   } = useInference();
 
@@ -415,12 +417,15 @@ export default function ChartDisplay() {
 
                             // For e2e chart: render clickable inline dropdown for x-axis
                             if (graph.chartDefinition.chartType === 'e2e') {
+                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                              const pctlWord = selectedPercentile.toUpperCase();
+                              const e2elLabel = isAgentic
+                                ? `${pctlWord} End-to-end Latency`
+                                : 'End-to-end Latency';
                               const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p90_ttft'
-                                  ? 'P90 TTFT'
-                                  : 'End-to-end Latency';
+                                selectedE2eXAxisMetric === 'p90_ttft' ? 'P90 TTFT' : e2elLabel;
                               const xAxisOptions = [
-                                { value: null, label: 'End-to-end Latency' },
+                                { value: null, label: e2elLabel },
                                 { value: 'p90_ttft', label: 'P90 TTFT' },
                               ];
                               const zoomPrefix =

From 9957f19e630c14fbfadb411725ba1736d58a83e1 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 18:53:56 -0500
Subject: [PATCH 23/55] feat(agentic): per-point trace_replay storage + detail
 page POC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Persist aiperf's profile_export.jsonl and server_metrics_export.{csv,json}
per agentic benchmark point in a new agentic_trace_replay sibling table
(migration 006), then a follow-up column for the gzipped time-series JSON
(migration 007). Ingest hook walks the agentic_<suffix> sibling artifact
and captures all three files; ~6 MB gz per point.

New /inference/agentic/[id] detail page renders:
- ISL / OSL histograms with p50/p75/p90/p95 guide lines
- KV cache utilization over time (raw scatter + 50-sample rolling avg)
- Request queue depth (running / waiting / total, smoothed)
- Prefix cache hit rate per interval (raw scatter + smoothed)
- Total + decode throughput with cumulative running-avg overlay
- Cumulative prompt token source breakdown (stacked area)

SiblingNav at the top renders the SKU label (e.g. "B200 · DeepSeek V4 Pro
· FP4 · vLLM") with chips for every (TP, conc, offload) variant in the
same workflow run so users can jump between sibling points.

Tooltip changes:
- portal to document.body + position:fixed so the tooltip can escape
  parent stacking contexts (backdrop-filter on the chart Card)
- clamp positioning to keep the tooltip inside the chart area
- "View charts →" button on pinned agentic points navigates to the
  detail page

Also ignores .claude/worktrees/ from oxlint so parallel agent worktrees
don't trip the pre-commit hook.
---
 .eslintignore                                 |   3 +
 .../inference/agentic/[id]/page.tsx           |  17 +
 .../app/src/app/api/unofficial-run/route.ts   |   4 +
 .../app/api/v1/benchmark-siblings/route.ts    |  38 +++
 .../src/app/api/v1/trace-histograms/route.ts  |  60 ++++
 .../app/api/v1/trace-server-metrics/route.ts  |  40 +++
 .../agentic-point/agentic-point-detail.tsx    | 308 +++++++++++++++++
 .../inference/agentic-point/distribution.tsx  | 140 ++++++++
 .../inference/agentic-point/sibling-nav.tsx   | 118 +++++++
 .../agentic-point/time-series-chart.tsx       | 311 ++++++++++++++++++
 .../app/src/components/inference/types.ts     |   2 +
 .../components/inference/ui/ScatterGraph.tsx  | 225 +++++++++----
 .../inference/utils/tooltipUtils.ts           |  34 +-
 .../src/components/ui/d3-chart-wrapper.tsx    |  53 ++-
 .../unofficial-run-provider.test.ts           |   1 +
 .../src/hooks/api/use-benchmark-siblings.ts   |  46 +++
 .../app/src/hooks/api/use-trace-histograms.ts |  39 +++
 .../src/hooks/api/use-trace-server-metrics.ts |  70 ++++
 packages/app/src/lib/api.ts                   |   2 +
 .../app/src/lib/benchmark-transform.test.ts   |   1 +
 packages/app/src/lib/benchmark-transform.ts   |   2 +
 .../app/src/lib/compare-pair-defaults.test.ts |   1 +
 .../src/lib/d3-chart/layers/scatter-points.ts |  30 +-
 .../migrations/006_agentic_trace_replay.sql   |  34 ++
 .../007_agentic_trace_server_metrics_json.sql |  17 +
 packages/db/src/etl/skip-tracker.test.ts      |   1 +
 packages/db/src/etl/skip-tracker.ts           |   3 +
 packages/db/src/etl/trace-replay-ingest.ts    |  83 +++++
 packages/db/src/ingest-ci-run.ts              |  90 +++++
 packages/db/src/ingest-gcs-backup.ts          |   2 +
 packages/db/src/json-provider.ts              |   1 +
 packages/db/src/queries/benchmark-siblings.ts | 132 ++++++++
 packages/db/src/queries/benchmarks.ts         |   9 +
 packages/db/src/queries/trace-histograms.ts   |  82 +++++
 .../db/src/queries/trace-server-metrics.ts    | 275 ++++++++++++++++
 35 files changed, 2196 insertions(+), 78 deletions(-)
 create mode 100644 .eslintignore
 create mode 100644 packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
 create mode 100644 packages/app/src/app/api/v1/benchmark-siblings/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-histograms/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-server-metrics/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/distribution.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/sibling-nav.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/time-series-chart.tsx
 create mode 100644 packages/app/src/hooks/api/use-benchmark-siblings.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-histograms.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-server-metrics.ts
 create mode 100644 packages/db/migrations/006_agentic_trace_replay.sql
 create mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql
 create mode 100644 packages/db/src/etl/trace-replay-ingest.ts
 create mode 100644 packages/db/src/queries/benchmark-siblings.ts
 create mode 100644 packages/db/src/queries/trace-histograms.ts
 create mode 100644 packages/db/src/queries/trace-server-metrics.ts

diff --git a/.eslintignore b/.eslintignore
new file mode 100644
index 00000000..513a873e
--- /dev/null
+++ b/.eslintignore
@@ -0,0 +1,3 @@
+# Stale agent worktrees produced by parallel Claude Code sessions — they
+# hold their own branches and are linted as part of their own runs.
+.claude/worktrees/
diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
new file mode 100644
index 00000000..77f29805
--- /dev/null
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -0,0 +1,26 @@
+import type { Metadata } from 'next';
+import { notFound } from 'next/navigation';
+
+import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
+
+export const metadata: Metadata = {
+  title: 'Agentic trace detail | InferenceX',
+  robots: { index: false },
+};
+
+/**
+ * Route `/inference/agentic/[id]`: renders the detail view for one agentic
+ * benchmark point. The `[id]` segment must be a positive integer; anything
+ * else (e.g. `/inference/agentic/abc`) renders the 404 page instead of
+ * passing `NaN` down to the client component and its API requests.
+ */
+export default async function AgenticPointDetailPage({
+  params,
+}: {
+  params: Promise<{ id: string }>;
+}) {
+  const { id } = await params;
+  const benchmarkId = Number(id);
+  if (!Number.isSafeInteger(benchmarkId) || benchmarkId <= 0) notFound();
+  return <AgenticPointDetail id={benchmarkId} />;
+}
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 7578e897..3d2d0da7 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -33,6 +33,10 @@ export function normalizeArtifactRows(
     if (!params) continue;
     const { config } = params;
     results.push({
+      // Synthetic id — overlay rows aren't persisted, so trace_replay lookups
+      // (keyed on benchmark_results.id) will always miss, which is the
+      // intended behaviour: overlays never have stored trace_replay blobs.
+      id: 0,
       hardware: config.hardware,
       framework: config.framework,
       model: config.model,
diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
new file mode 100644
index 00000000..14c1d461
--- /dev/null
+++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
@@ -0,0 +1,38 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getBenchmarkSiblings,
+  type BenchmarkSiblings,
+} from '@semianalysisai/inferencex-db/queries/benchmark-siblings';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedSiblings = cachedQuery(
+  (id: number): Promise<BenchmarkSiblings | null> => getBenchmarkSiblings(getDb(), id),
+  'benchmark-siblings',
+);
+
+/**
+ * GET /api/v1/benchmark-siblings?id=N — `id` must be a positive integer (else 400).
+ *
+ * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the
+ * benchmark_result + all sibling rows that share that SKU within the same
+ * workflow_run. Used by the agentic detail page to render a navigator.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedSiblings(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching benchmark siblings:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
new file mode 100644
index 00000000..fd7572a8
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -0,0 +1,60 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceHistograms,
+  type TraceHistogramMap,
+} from '@semianalysisai/inferencex-db/queries/trace-histograms';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceHistograms = cachedQuery(
+  (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
+  'trace-histograms',
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/trace-histograms?ids=1,2,3
+ *
+ * Returns per-request ISL/OSL arrays parsed from the stored aiperf
+ * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`. Tokens that
+ * are not positive integers are dropped; ids without a trace_replay blob are omitted.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    // Sort the cache key so the same set of ids in any order hits the same entry.
+    const sorted = ids.toSorted((a, b) => a - b);
+    const histograms = await getCachedTraceHistograms(sorted);
+    return cachedJson(histograms);
+  } catch (error) {
+    console.error('Error fetching trace histograms:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
new file mode 100644
index 00000000..7346a3e8
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceServerMetrics,
+  type TraceServerMetrics,
+} from '@semianalysisai/inferencex-db/queries/trace-server-metrics';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceServerMetrics = cachedQuery(
+  (id: number): Promise<TraceServerMetrics | null> => getTraceServerMetrics(getDb(), id),
+  'trace-server-metrics',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-server-metrics?id=N — `id` must be a positive integer (else 400).
+ *
+ * Returns parsed time-series for the agentic detail view: KV cache usage,
+ * prefix cache hit rate per interval, queue depth, and per-source prompt
+ * token rates. Times are in seconds from benchmark start. 404 if the point
+ * has no stored server_metrics_export.json blob.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedTraceServerMetrics(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching trace server metrics:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
new file mode 100644
index 00000000..3cd274ba
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -0,0 +1,308 @@
+'use client';
+
+import Link from 'next/link';
+import { useRouter } from 'next/navigation';
+import { ArrowLeft } from 'lucide-react';
+
+import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import {
+  useTraceServerMetrics,
+  type PointMeta,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '@/hooks/api/use-trace-server-metrics';
+import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+
+import { Distribution } from './distribution';
+import { SiblingNav } from './sibling-nav';
+import {
+  StackedAreaChart,
+  TimeSeriesChart,
+  cumulativeAverage,
+  rollingAverage,
+  sumSeries,
+} from './time-series-chart';
+
+interface Props {
+  id: number;
+}
+
+const fmtPct = (v: number | null | undefined): string =>
+  v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`;
+
+function MetaLine({ label, value }: { label: string; value: React.ReactNode }) {
+  return (
+    <div className="flex flex-col gap-0.5">
+      <span className="text-xs uppercase tracking-wide text-muted-foreground">{label}</span>
+      <span className="text-sm font-medium text-foreground">{value}</span>
+    </div>
+  );
+}
+
+function PointSummary({ meta }: { meta: PointMeta }) {
+  return (
+    <div className="mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-2">
+        <p className="text-sm text-muted-foreground">
+          Selected point
+          {meta.disagg ? ' · disagg' : ''}
+          {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''}
+        </p>
+        {meta.run_url && (
+          <a
+            href={meta.run_url}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="text-xs text-muted-foreground hover:text-foreground underline"
+          >
+            GitHub Actions run →
+          </a>
+        )}
+      </div>
+      <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-6 gap-3">
+        <MetaLine label="Offload" value={(meta.offload_mode ?? 'off').toUpperCase()} />
+        <MetaLine label="Concurrency" value={meta.conc} />
+        <MetaLine label="GPU cache hit" value={fmtPct(meta.server_gpu_cache_hit_rate)} />
+        <MetaLine label="CPU cache hit" value={fmtPct(meta.server_cpu_cache_hit_rate)} />
+        {meta.isl !== null && <MetaLine label="ISL" value={meta.isl} />}
+        {meta.osl !== null && <MetaLine label="OSL" value={meta.osl} />}
+      </div>
+    </div>
+  );
+}
+
+function ChartCard({ title, children }: { title: string; children: React.ReactNode }) {
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+      <h2 className="text-sm font-semibold text-foreground mb-3">{title}</h2>
+      {children}
+    </div>
+  );
+}
+
+/**
+ * Client-side detail view for one agentic benchmark point.
+ *
+ * Fires three independent queries keyed on `id` (ISL/OSL histograms, server
+ * metrics time-series, sibling navigator) and renders each section as soon as
+ * its own data arrives. A `null` metrics result is reported as "no stored
+ * trace_replay blob"; a failed metrics request shows an error banner.
+ */
+export function AgenticPointDetail({ id }: Props) {
+  const router = useRouter();
+  const histQuery = useTraceHistograms([id], true);
+  const metricsQuery = useTraceServerMetrics(id, true);
+  const siblingsQuery = useBenchmarkSiblings(id);
+
+  const hist = histQuery.data?.[id];
+  const metrics = metricsQuery.data;
+  const siblingsData = siblingsQuery.data;
+  // Chart placeholders: pulse only while the request is in flight, then settle
+  // on "No data" so a missing blob or failed fetch never looks like a load.
+  const histFallback = histQuery.isLoading ? <Skeleton /> : <Empty />;
+  const metricsFallback = metricsQuery.isLoading ? <Skeleton /> : <Empty />;
+
+  return (
+    <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
+      <div className="flex items-center gap-2">
+        <button
+          type="button"
+          onClick={() => router.back()}
+          className="inline-flex items-center gap-1 text-sm text-muted-foreground hover:text-foreground"
+        >
+          <ArrowLeft className="size-4" /> Back
+        </button>
+        <span className="text-sm text-muted-foreground">·</span>
+        <Link href="/inference" className="text-sm text-muted-foreground hover:text-foreground">
+          Inference chart
+        </Link>
+      </div>
+
+      {siblingsData ? (
+        <SiblingNav sku={siblingsData.sku} siblings={siblingsData.siblings} />
+      ) : siblingsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading SKU navigator…</div>
+      ) : null}
+
+      {metrics ? (
+        <PointSummary meta={metrics.meta} />
+      ) : metricsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading point metadata…</div>
+      ) : null}
+
+      {metricsQuery.isError && (
+        <div className="rounded-lg border border-destructive/40 bg-destructive/10 p-4 text-sm text-destructive">
+          Failed to load trace data for benchmark point #{id}.
+        </div>
+      )}
+      {metricsQuery.data === null && !metricsQuery.isLoading && (
+        <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+          No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf
+          time-series capture, or its source artifacts have expired on GitHub.
+        </div>
+      )}
+
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+        <ChartCard title="Input sequence length distribution">
+          {hist ? <Distribution values={hist.isl} unit="tokens" /> : histFallback}
+        </ChartCard>
+        <ChartCard title="Output sequence length distribution">
+          {hist ? <Distribution values={hist.osl} unit="tokens" /> : histFallback}
+        </ChartCard>
+
+        <ChartCard title="KV cache utilization over time">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'GPU KV cache (avg n=50)',
+                  data: rollingAverage(metrics.kvCacheUsage, 50),
+                  rawData: metrics.kvCacheUsage,
+                  color: '#3b82f6',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yMax={1}
+              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+              yAxisLabel="KV cache (%)"
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Request queue depth">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'Running (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.running,
+                    })),
+                    50,
+                  ),
+                  color: '#22c55e',
+                  strokeWidth: 2,
+                },
+                {
+                  name: 'Waiting (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.waiting,
+                    })),
+                    50,
+                  ),
+                  color: '#ef4444',
+                  strokeWidth: 2,
+                },
+                {
+                  name: 'Total (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.total,
+                    })),
+                    50,
+                  ),
+                  color: '#3b82f6',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yAxisLabel="Requests"
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Prefix cache hit rate per interval">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'GPU (HBM, avg n=50)',
+                  data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                  rawData: metrics.prefixCacheHitRate,
+                  color: '#a855f7',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yMax={1}
+              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+              yAxisLabel="Hit rate (%)"
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Throughput (total & decode)">
+          {metrics ? (
+            (() => {
+              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(total, 50),
+                      color: '#3b82f6',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Decode (avg n=50)',
+                      data: rollingAverage(metrics.decodeTps, 50),
+                      color: '#f97316',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Total running avg',
+                      data: cumulativeAverage(total),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens / sec"
+                />
+              );
+            })()
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Cumulative prompt token source breakdown">
+          {metrics ? (
+            <StackedAreaChart
+              sourceSeries={metrics.promptTokensBySource}
+              durationS={metrics.durationS}
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+      </div>
+    </div>
+  );
+}
+
+function Skeleton() {
+  return <div className="h-[260px] rounded-md bg-muted/30 animate-pulse" />;
+}
+
+function Empty() {
+  return (
+    <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+  );
+}
+
+// Re-export type for use by sub-components
+export type { TimeSeriesPoint, QueueDepthPoint };
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
new file mode 100644
index 00000000..c9a563fe
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -0,0 +1,144 @@
+'use client';
+
+import { useMemo } from 'react';
+
+// Fixed viewBox geometry (user units); the <svg> scales to its container width.
+const W = 720;
+const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+/**
+ * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
+ * detail-page card — fills its container width via `viewBox` + 100% width.
+ *
+ * @param values Samples to bin; an empty array renders a "No data" placeholder.
+ * @param unit Unit label shown in the caption and x-axis title. It is
+ *   interpolated verbatim into the SVG markup, so pass trusted literals only.
+ * @param height viewBox height in user units (default 260).
+ */
+export function Distribution({
+  values,
+  unit,
+  height = 260,
+}: {
+  values: readonly number[];
+  unit: string;
+  height?: number;
+}) {
+  const H = height;
+
+  const svgParts = useMemo(() => {
+    if (values.length === 0) return null;
+    // toSorted already returns a copy, so the readonly input is never mutated.
+    const sorted = values.toSorted((a, b) => a - b);
+    const min = sorted[0]!;
+    const max = sorted.at(-1)!;
+    const range = Math.max(1e-9, max - min);
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+
+    // Square-root rule (√n bins), clamped to [15, 50] so bars stay visible.
+    const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
+    const counts: number[] = Array.from({ length: nBins }, () => 0);
+    for (const v of values) {
+      const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+      counts[i]!++;
+    }
+    const maxCount = Math.max(...counts, 1);
+    const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+    const barW = innerW / nBins;
+
+    const fmt = (n: number) =>
+      n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
+
+    // Linear interpolation between the two nearest order statistics.
+    const quantile = (q: number): number => {
+      const pos = (sorted.length - 1) * q;
+      const lo = Math.floor(pos);
+      const hi = Math.ceil(pos);
+      return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+    };
+
+    const bars = counts
+      .map((c, i) => {
+        const h = (c / maxCount) * innerH;
+        const x = PAD.left + i * barW;
+        const y = PAD.top + (innerH - h);
+        return `<rect x="${x.toFixed(2)}" y="${y.toFixed(2)}" width="${Math.max(0, barW - 1).toFixed(2)}" height="${h.toFixed(2)}" fill="currentColor" opacity="0.55" />`;
+      })
+      .join('');
+
+    const GUIDES = [
+      { label: 'p50', q: 0.5, color: '#3b82f6' },
+      { label: 'p75', q: 0.75, color: '#22c55e' },
+      { label: 'p90', q: 0.9, color: '#f59e0b' },
+      { label: 'p95', q: 0.95, color: '#ef4444' },
+    ] as const;
+    const guides = GUIDES.map(({ q, color }) => {
+      const v = quantile(q);
+      const x = xScale(v);
+      return `<line x1="${x.toFixed(2)}" x2="${x.toFixed(2)}" y1="${PAD.top}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" opacity="0.95" />`;
+    }).join('');
+
+    // 4-tick x-axis: min, ~33%, ~66%, max
+    const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+    const axisY = PAD.top + innerH + 14;
+    const axisLine = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${(PAD.top + innerH).toFixed(2)}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
+    const xLabels = xTickVals
+      .map((v, i) => {
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${xScale(v).toFixed(2)}" y="${axisY}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmt(v)}</text>`;
+      })
+      .join('');
+    const axisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">value (${unit})</text>`;
+
+    // 5-tick y-axis
+    const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
+    const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${PAD.left}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.4" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${fmt(v)}</text></g>`;
+      })
+      .join('');
+    const yAxisLabel = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">count</text>`;
+
+    const chipY = H - 8;
+    const chipW = innerW / GUIDES.length;
+    const legend = GUIDES.map(({ label: ql, q, color }, i) => {
+      const v = quantile(q);
+      const x = PAD.left + i * chipW;
+      return `
+      <line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" />
+      <text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${ql} ${fmt(v)}</text>`;
+    }).join('');
+
+    return {
+      // Extremes come from the sorted copy, so render never has to spread
+      // `values` into Math.min/Math.max (which can overflow on huge arrays).
+      min,
+      max,
+      html: bars + guides + axisLine + xLabels + axisTitle + yAxisLabel + yTicks + legend,
+    };
+  }, [values, unit, H]);
+
+  if (svgParts === null) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <div className="w-full">
+      <div className="mb-2 text-xs text-muted-foreground">
+        {values.length.toLocaleString()} requests · range {Math.round(svgParts.min)}–
+        {Math.round(svgParts.max)} {unit}
+      </div>
+      <svg
+        viewBox={`0 0 ${W} ${H}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+        dangerouslySetInnerHTML={{ __html: svgParts.html }}
+      />
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
new file mode 100644
index 00000000..776c8ba2
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -0,0 +1,118 @@
+'use client';
+
+import { useRouter } from 'next/navigation';
+import { ChevronLeft, ChevronRight } from 'lucide-react';
+
+import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+
+const HW_LABELS: Record<string, string> = { // hardware id -> heading label, looked up by exact key in hwLabel()
+  b200: 'B200',
+  b300: 'B300',
+  gb200: 'GB200',
+  gb300: 'GB300',
+  h100: 'H100',
+  h200: 'H200',
+  mi300x: 'MI300X',
+  mi325x: 'MI325X',
+  mi355x: 'MI355X',
+}; // every entry above equals its key upper-cased, which is also what hwLabel() returns for unlisted ids
+
+const MODEL_LABELS: Record<string, string> = { // model id -> heading label; modelLabel() returns unlisted ids unchanged
+  dsr1: 'DeepSeek R1',
+  dsv4: 'DeepSeek V4 Pro',
+  glm5: 'GLM-5',
+  'glm5.1': 'GLM-5.1',
+  gptoss120b: 'gpt-oss 120B',
+  kimik2: 'Kimi K2',
+  'kimik2.5': 'Kimi K2.5',
+  'kimik2.6': 'Kimi K2.6',
+  llama70b: 'Llama 3.3 70B',
+  'minimaxm2.5': 'MiniMax M2.5',
+  'minimaxm2.7': 'MiniMax M2.7',
+  'qwen3.5': 'Qwen 3.5',
+};
+
+/** Heading label for a hardware id; ids without an HW_LABELS entry are upper-cased. */
+const hwLabel = (hw: string): string =>
+  HW_LABELS[hw] ?? hw.toUpperCase();
+/** Heading label for a model id; ids without a MODEL_LABELS entry pass through unchanged. */
+const modelLabel = (m: string): string =>
+  MODEL_LABELS[m] ?? m;
+/** Display name for a framework id; ids with no known spelling are returned unchanged. */
+function frameworkLabel(fw: string) {
+  const exact = new Map([['vllm', 'vLLM'], ['sglang', 'SGLang'], ['trt', 'TRT'], ['mori-sglang', 'Mori-SGLang']]);
+  const known = exact.get(fw);
+  if (known !== undefined) return known;
+  const prefix = 'dynamo-';
+  return fw.startsWith(prefix) ? `Dynamo ${fw.slice(prefix.length).toUpperCase()}` : fw;
+}
+
+/** Chip text for one sibling: parallelism layout, concurrency, and an offload marker when offload_mode is 'on'. */
+function chipLabel(s: BenchmarkSibling): string {
+  const epSuffix = s.decode_ep > 1 ? `EP${s.decode_ep}` : '';
+  const layout = s.disagg ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D` : `TP${s.decode_tp}${epSuffix}`;
+  const segments = [layout, `c=${s.conc}`];
+  if (s.offload_mode === 'on') segments.push('off=ON');
+  return segments.join(' • ');
+}
+
+/**
+ * Header for a benchmark point page: SKU title, point count and run date, then
+ * prev / next buttons around one chip per sibling. All navigation is router.push.
+ */
+export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) {
+  const router = useRouter();
+  const goTo = (target?: BenchmarkSibling | null) => {
+    if (target) router.push(`/inference/agentic/${target.id}`);
+  };
+
+  // Neighbours in list order; both stay null when no sibling is flagged is_current.
+  const here = siblings.findIndex((s) => s.is_current);
+  const prev = here > 0 ? siblings[here - 1] : null;
+  const next = here !== -1 && here < siblings.length - 1 ? siblings[here + 1] : null;
+
+  const titleParts = [hwLabel(sku.hardware), modelLabel(sku.model), sku.precision.toUpperCase()];
+  const skuLabel = [...titleParts, frameworkLabel(sku.framework)].join(' · ');
+  const plural = siblings.length === 1 ? '' : 's';
+  const arrowClass =
+    'inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed';
+  // Joined with single spaces, so a chip that has a trace keeps a trailing space in its class string.
+  const chipClass = (s: BenchmarkSibling) =>
+    [
+      'px-2 py-1 rounded-md text-xs border transition-colors',
+      s.is_current ? 'border-primary bg-primary text-primary-foreground font-medium' : 'border-border/40 text-foreground hover:bg-accent',
+      s.has_trace ? '' : 'opacity-60',
+    ].join(' ');
+
+  return (
+    <div className="border-b border-border/40 pb-4 mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-3">
+        <h1 className="text-2xl font-semibold text-foreground">{skuLabel}</h1>
+        <span className="text-xs text-muted-foreground">
+          {siblings.length} point{plural} in this run · {sku.date}
+        </span>
+      </div>
+      <div className="flex items-center gap-2 flex-wrap">
+        <button type="button" disabled={!prev} onClick={() => goTo(prev)} className={arrowClass} aria-label="Previous point">
+          <ChevronLeft className="size-3.5" /> prev
+        </button>
+        <div className="flex items-center gap-1 flex-wrap">
+          {siblings.map((s) => (
+            <button
+              key={s.id}
+              type="button"
+              onClick={() => !s.is_current && goTo(s)}
+              className={chipClass(s)}
+              title={s.has_trace ? undefined : 'No stored trace data'}
+            >
+              {chipLabel(s)}
+            </button>
+          ))}
+        </div>
+        <button type="button" disabled={!next} onClick={() => goTo(next)} className={arrowClass} aria-label="Next point">
+          next <ChevronRight className="size-3.5" />
+        </button>
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
new file mode 100644
index 00000000..bc081b4e
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -0,0 +1,311 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+
+interface Series {
+  name: string; // legend text for this series
+  /** The line to draw (caller pre-smooths if desired). */
+  data: TimeSeriesPoint[];
+  /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */
+  rawData?: TimeSeriesPoint[];
+  color: string; // stroke of the line and of the legend swatch, fill of the raw dots
+  /** Stroke width of the line (default 1.8) and of the legend swatch (default 2). */
+  strokeWidth?: number;
+}
+
+interface TimeSeriesChartProps {
+  series: Series[]; // drawn, and listed in the legend, in array order
+  durationS: number; // x-axis extent in seconds; anything below 1 is treated as 1
+  yMax?: number; // y-axis top; defaults to the largest `data` value across all series
+  yFmt?: (v: number) => string; // y tick label formatter; defaults to fmtInt
+  yAxisLabel?: string; // rotated title beside the y axis; nothing is drawn when unset or empty
+  height?: number; // viewBox height, default 260 (the viewBox width is fixed at 720)
+}
+
+/**
+ * Centered moving mean. Output i averages the samples within
+ * floor(windowSize / 2) positions of i on either side, clipped at the array
+ * ends, and keeps the timestamp of sample i. An even windowSize therefore spans
+ * up to windowSize + 1 samples. Returns `data` itself when empty or windowSize <= 1.
+ */
+export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
+  if (windowSize <= 1 || data.length === 0) return data;
+  const reach = Math.floor(windowSize / 2);
+  return data.map((point, idx) => {
+    const lo = Math.max(0, idx - reach);
+    const hi = Math.min(data.length, idx + reach + 1);
+    let total = 0;
+    for (let k = lo; k < hi; k++) total += data[k]!.value;
+    const count = hi - lo;
+    return { t: point.t, value: count > 0 ? total / count : 0 };
+  });
+}
+
+/**
+ * Expanding-window mean: output i is the average of values 0..i, paired with
+ * the timestamp of sample i. An empty input array is returned as-is.
+ */
+export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  if (data.length === 0) return data;
+  let runningTotal = 0;
+  return data.map(({ t, value }, idx) => {
+    runningTotal += value;
+    // idx + 1 samples have been folded into the total at this point.
+    const seen = idx + 1;
+    return { t, value: runningTotal / seen };
+  });
+}
+
+/**
+ * Index-wise sum of two series, truncated to the shorter length. Each output
+ * point takes its timestamp from `a`; the t values of `b` are never read.
+ */
+export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  const count = Math.min(a.length, b.length);
+  const head = a.slice(0, count);
+  return head.map((p, i) => ({ t: p.t, value: p.value + b[i]!.value }));
+}
+
+/** Integer label; en-US thousands separators once the ROUNDED value reaches 10,000 (so 9999.6 prints "10,000"). */
+const fmtInt = (n: number) => (Math.round(n) >= 10000 ? Math.round(n).toLocaleString('en-US') : String(Math.round(n)));
+
+/** Formats seconds as `42s` or `3m 5s`; rounds once, up front, so the seconds part can never print as 60. */
+const fmtSeconds = (s: number) => {
+  const total = Math.round(s);
+  if (total < 60) return `${total}s`;
+  return `${Math.floor(total / 60)}m ${total % 60}s`;
+};
+
+/**
+ * Line chart emitted as SVG markup: optional raw scatter behind one path per series,
+ * five x ticks over 0..durationS, five y ticks over 0..yMax, legend row at the bottom.
+ * Caller-supplied strings (names, colors, axis label, yFmt output) are escaped before innerHTML injection.
+ */
+export function TimeSeriesChart({
+  series,
+  durationS,
+  yMax: yMaxOpt,
+  yFmt = fmtInt,
+  yAxisLabel,
+  height = 260,
+}: TimeSeriesChartProps) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const inner = useMemo(() => {
+    const esc = (text: string) => String(text).replace(/[&<>"]/g, (ch) => `&#${ch.charCodeAt(0)};`);
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    // Loop instead of Math.max(...values): a long series spread as call arguments can throw RangeError.
+    let dataMax = 1e-9;
+    for (const s of series) for (const d of s.data) if (d.value > dataMax) dataMax = d.value;
+    // A zero, negative or NaN yMax would turn every y coordinate into NaN / Infinity.
+    const yMax = yMaxOpt !== undefined && yMaxOpt > 0 ? yMaxOpt : dataMax;
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
+
+    // Keep about one sample per horizontal unit of the plot area.
+    const subsample = (arr: TimeSeriesPoint[]) => {
+      const stride = Math.max(1, Math.floor(arr.length / innerW));
+      return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+    };
+
+    // Raw scatter is emitted first so every line paints on top of every dot.
+    const dotsLayer = series
+      .map((s) => {
+        const fill = esc(s.color);
+        return subsample(s.rawData ?? [])
+          .map((d) => `<circle cx="${xScale(d.t).toFixed(2)}" cy="${yScale(d.value).toFixed(2)}" r="1.5" fill="${fill}" opacity="0.2" />`)
+          .join('');
+      })
+      .join('');
+
+    const lineLayer = series
+      .map((s) => {
+        if (s.data.length === 0) return '';
+        const d = subsample(s.data)
+          .map((p, i) => `${i === 0 ? 'M' : 'L'}${xScale(p.t).toFixed(2)},${yScale(p.value).toFixed(2)}`)
+          .join(' ');
+        return `<path d="${d}" fill="none" stroke="${esc(s.color)}" stroke-width="${s.strokeWidth ?? 1.8}" />`;
+      })
+      .join('');
+
+    // X axis: baseline plus 5 tick labels at 0..xMax; the first and last labels anchor inward.
+    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+    const axisY = PAD.top + innerH;
+    const baseline = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
+    const xLabels = xTickVals
+      .map((v, i) => {
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${xScale(v).toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
+      })
+      .join('');
+    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
+
+    // Y axis: 5 gridlines with labels at 0..yMax, plus an optional rotated title.
+    const yTicks = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4)
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${esc(yFmt(v))}</text></g>`;
+      })
+      .join('');
+    const yAxisTitle = yAxisLabel
+      ? `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">${esc(yAxisLabel)}</text>`
+      : '';
+
+    // Legend: one swatch and name per series, spread evenly under the plot.
+    const chipY = H - 8;
+    const chipW = innerW / Math.max(1, series.length);
+    const legend = series
+      .map((s, i) => {
+        const x = PAD.left + i * chipW;
+        return `<line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${esc(s.color)}" stroke-width="${s.strokeWidth ?? 2}" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${esc(s.name)}</text>`;
+      })
+      .join('');
+
+    return dotsLayer + lineLayer + baseline + xLabels + xAxisTitle + yTicks + yAxisTitle + legend;
+  }, [series, durationS, yMaxOpt, yFmt, yAxisLabel, H]);
+
+  if (series.every((s) => s.data.length === 0)) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <svg
+      viewBox={`0 0 ${W} ${H}`}
+      preserveAspectRatio="xMidYMid meet"
+      className="w-full h-auto text-foreground"
+      dangerouslySetInnerHTML={{ __html: inner }}
+    />
+  );
+}
+
+/**
+ * Stacked-area chart of each token source's cumulative share over time: every
+ * series becomes a running total, and at each sample the totals are normalised
+ * to fractions of their sum (all zero while that sum is zero). Sample times come
+ * from the first non-empty series. Legend text is escaped because the markup is
+ * injected through innerHTML and unmapped source names come straight from the data keys.
+ */
+export function StackedAreaChart({
+  sourceSeries,
+  durationS,
+  height = 260,
+}: {
+  sourceSeries: Record<string, TimeSeriesPoint[]>;
+  durationS: number;
+  height?: number;
+}) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const inner = useMemo(() => {
+    // Sources with no samples are dropped before stacking.
+    const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
+    if (entries.length === 0) return '';
+    const esc = (text: string) => String(text).replace(/[&<>"]/g, (ch) => `&#${ch.charCodeAt(0)};`);
+    const tValues = entries[0]![1].map((p) => p.t);
+
+    // Running total per source, in the same order as `entries` ...
+    const running = entries.map(([, pts]) => {
+      let acc = 0;
+      return pts.map((p) => (acc += p.value));
+    });
+    // ... then each source's fraction of the grand total at every sample.
+    const grand = tValues.map((_, i) => running.reduce((sum, r) => sum + (r[i] ?? 0), 0));
+    const shares = running.map((totals) =>
+      tValues.map((_, i) => (grand[i]! > 0 ? (totals[i] ?? 0) / grand[i]! : 0)),
+    );
+
+    // NOTE(review): `miss` reuses the `local_compute` orange — confirm the two never appear together.
+    const colors: Record<string, string> = {
+      local_compute: '#f97316',
+      local_cache_hit: '#3b82f6',
+      external_kv_transfer: '#22c55e',
+      miss: '#f97316',
+    };
+    const labelFor: Record<string, string> = {
+      local_compute: 'Prefill',
+      local_cache_hit: 'HBM Cache Hit',
+      external_kv_transfer: 'Offload Cache Hit',
+      miss: 'Miss',
+    };
+
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v) * innerH;
+
+    // Stack bottom-up: each band runs from the previous band's top (`floor`) to floor + share.
+    const xs = tValues.map((t) => xScale(t).toFixed(2));
+    let floor = tValues.map(() => 0);
+    const layers = entries.map(([name], k) => {
+      const ceiling = shares[k]!.map((v, i) => floor[i]! + v);
+      const topEdge = ceiling.map((v, i) => `${i === 0 ? 'M' : 'L'}${xs[i]},${yScale(v).toFixed(2)}`);
+      // Walk the lower edge right-to-left so the outline closes into one polygon.
+      const bottomEdge = floor.map((v, i) => `L${xs[i]},${yScale(v).toFixed(2)}`).toReversed();
+      floor = ceiling;
+      const color = colors[name] ?? '#6b7280';
+      const d = `${topEdge.join(' ')} ${bottomEdge.join(' ')} Z`;
+      return { name, color, path: `<path d="${d}" fill="${color}" opacity="0.75" />` };
+    });
+
+    const paths = layers.map((l) => l.path).join('');
+
+    // X axis: baseline plus 5 tick labels at 0..xMax.
+    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+    const axisY = PAD.top + innerH;
+    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
+      .map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
+      })
+      .join('')}`;
+    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
+
+    // Y axis: fixed 0..100% scale in 25% steps.
+    const yTicks = [0, 0.25, 0.5, 0.75, 1]
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${(v * 100).toFixed(0)}%</text></g>`;
+      })
+      .join('');
+    const yAxisTitle = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">% of prefill tokens</text>`;
+
+    // Legend: one swatch per band; source keys without a label fall back to the (escaped) raw key.
+    const chipY = H - 8;
+    const chipW = innerW / Math.max(1, layers.length);
+    const legend = layers
+      .map((l, i) => {
+        const x = PAD.left + i * chipW;
+        return `<rect x="${(x + 2).toFixed(2)}" y="${chipY - 9}" width="12" height="8" fill="${l.color}" opacity="0.75" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${esc(labelFor[l.name] ?? l.name)}</text>`;
+      })
+      .join('');
+
+    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
+  }, [sourceSeries, durationS, H]);
+
+  if (Object.values(sourceSeries).every((v) => v.length === 0)) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <svg
+      viewBox={`0 0 ${W} ${H}`}
+      preserveAspectRatio="xMidYMid meet"
+      className="w-full h-auto text-foreground"
+      dangerouslySetInnerHTML={{ __html: inner }}
+    />
+  );
+}
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index f848e0e4..7a39bbd1 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -36,6 +36,8 @@ import type { Model, Sequence } from '@/lib/data-mappings';
  * @property {number} p99_e2el - 99th percentile of End-to-End Latency.
  */
 export interface AggDataEntry {
+  /** Stable per-point id from benchmark_results — for trace_replay lookups. */
+  id?: number;
   hw: string;
   mtp?: string;
   hwKey: string;
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 98562fb9..fdcf8952 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -6,6 +6,8 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
+import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import { useRouter } from 'next/navigation';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import { computeToggle } from '@/hooks/useTogglableSet';
@@ -348,6 +350,10 @@ const ScatterGraph = React.memo(
     );
 
     const rooflines = useMemo(() => {
+      // Frontier scope is (hw, precision, date) — points from different dates
+      // can never share a frontier (a May 15 point can't dominate a May 17 plot).
+      // The legend grouping is still by (hw, precision); we just split the
+      // pareto compute per date and re-merge into the legend bucket.
       const result: Record<string, InferenceData[]> = {};
       const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
       const dir = chartDefinition[rooflineKey] as
@@ -356,17 +362,31 @@ const ScatterGraph = React.memo(
         | 'lower_left'
         | 'lower_right'
         | undefined;
-      for (const hw of Object.keys(groupedData)) {
-        const front =
-          dir === 'upper_right'
-            ? paretoFrontUpperRight(groupedData[hw])
-            : dir === 'upper_left'
-              ? paretoFrontUpperLeft(groupedData[hw])
-              : dir === 'lower_left'
-                ? paretoFrontLowerLeft(groupedData[hw])
-                : paretoFrontLowerRight(groupedData[hw]);
-        front.sort((a, b) => a.x - b.x);
-        result[hw] = front;
+      const frontierFn =
+        dir === 'upper_right'
+          ? paretoFrontUpperRight
+          : dir === 'upper_left'
+            ? paretoFrontUpperLeft
+            : dir === 'lower_left'
+              ? paretoFrontLowerLeft
+              : paretoFrontLowerRight;
+      for (const hwKey of Object.keys(groupedData)) {
+        const byDate = new Map<string, InferenceData[]>();
+        for (const p of groupedData[hwKey]) {
+          const d = p.date;
+          let bucket = byDate.get(d);
+          if (!bucket) {
+            bucket = [];
+            byDate.set(d, bucket);
+          }
+          bucket.push(p);
+        }
+        const combined: InferenceData[] = [];
+        for (const datePoints of byDate.values()) {
+          combined.push(...frontierFn(datePoints));
+        }
+        combined.sort((a, b) => a.x - b.x);
+        result[hwKey] = combined;
       }
       return result;
     }, [groupedData, selectedYAxisMetric, chartDefinition]);
@@ -374,7 +394,7 @@ const ScatterGraph = React.memo(
     const optimalPointKeys = useMemo(() => {
       const keys = new Set<string>();
       Object.values(rooflines).forEach((pts) =>
-        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)),
+        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}_${p.date}-${p.x}-${p.y}`)),
       );
       return keys;
     }, [rooflines]);
@@ -477,6 +497,18 @@ const ScatterGraph = React.memo(
     // All official points for rendering (unfiltered — visibility via opacity)
     const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
 
+    // Trace-replay histograms (ISL / OSL distributions) for agentic points.
+    // Pre-fetch the whole visible set so tooltip render stays synchronous.
+    const agenticIds = useMemo(() => {
+      const ids: number[] = [];
+      for (const p of pointsData) {
+        if (p.benchmark_type === 'agentic_traces' && typeof p.id === 'number') ids.push(p.id);
+      }
+      return ids;
+    }, [pointsData]);
+    const { data: traceHistograms } = useTraceHistograms(agenticIds);
+    const router = useRouter();
+
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
       const globalLabelColorMap = new Map<string, string>();
@@ -516,7 +548,9 @@ const ScatterGraph = React.memo(
     const visiblePoints = useMemo(() => {
       let pts = filteredData;
       if (hideNonOptimal) {
-        pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`));
+        pts = pts.filter((d) =>
+          optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`),
+        );
       }
       return processedOverlayData.length > 0 ? [...pts, ...processedOverlayData] : pts;
     }, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]);
@@ -601,7 +635,8 @@ const ScatterGraph = React.memo(
       (d: InferenceData) =>
         effectiveActiveHwTypes.has(d.hwKey as string) &&
         selectedPrecisions.includes(d.precision) &&
-        (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)),
+        (!hideNonOptimal ||
+          optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`)),
       [effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys],
     );
 
@@ -739,6 +774,8 @@ const ScatterGraph = React.memo(
             hardwareConfig,
             isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
             runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+            traceHistogram:
+              typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined,
           }),
         getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
         getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -754,26 +791,43 @@ const ScatterGraph = React.memo(
           ),
         onPointClick: (d: InferenceData) => {
           track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y });
-          // Attach track-over-time button handler in the tooltip
           const tooltipEl = chartRef.current?.getTooltipElement();
-          if (tooltipEl) {
-            const btn = tooltipEl.querySelector('[data-action="track-over-time"]');
-            if (btn) {
-              btn.addEventListener('click', (btnEvent) => {
-                btnEvent.stopPropagation();
-                const configId = buildPointConfigId(d);
-                if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
-                else addTrackedConfig(d, chartDefinition.chartType);
-                chartRef.current?.dismissTooltip();
-                chartRef.current?.hideTooltip();
-                track('latency_point_tracked_via_tooltip', {
-                  hwKey: String(d.hwKey),
-                  tp: d.tp,
-                  conc: d.conc,
-                  precision: d.precision,
-                });
+          if (!tooltipEl) return;
+
+          // ── Summary-page actions ──────────────────────────────────────────
+          const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]');
+          if (trackBtn) {
+            trackBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              const configId = buildPointConfigId(d);
+              if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
+              else addTrackedConfig(d, chartDefinition.chartType);
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              track('latency_point_tracked_via_tooltip', {
+                hwKey: String(d.hwKey),
+                tp: d.tp,
+                conc: d.conc,
+                precision: d.precision,
               });
-            }
+            });
+          }
+
+          // ── "View charts" → navigate to dedicated detail page ────────────
+          const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+          if (viewBtn && typeof d.id === 'number') {
+            const pointId = d.id;
+            viewBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              track('latency_view_charts_opened', {
+                id: pointId,
+                hwKey: String(d.hwKey),
+                conc: d.conc,
+              });
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              router.push(`/inference/agentic/${pointId}`);
+            });
           }
         },
         attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0)
@@ -788,6 +842,11 @@ const ScatterGraph = React.memo(
         removeTrackedConfig,
         chartDefinition.chartType,
         selectedPrecisions,
+        // Tooltip content closure reads traceHistograms to decide whether to
+        // show the "View charts" button — rebuild config when the histogram
+        // fetch resolves so the button appears for points that have data.
+        traceHistograms,
+        router,
       ],
     );
 
@@ -838,35 +897,64 @@ const ScatterGraph = React.memo(
             const precision = key.split('_').pop()!;
             const visible =
               effectiveActiveHwTypes.has(hw) && selectedPrecisions.includes(precision);
-            let stroke = getCssColor(resolveColor(hw));
-
-            if (showGradientLabels) {
-              const pointLabels = allPointLabelsByKey[key];
-              if (pointLabels) {
-                const stops = computeGradientStops(pointLabels, xScale);
-                if (stops) {
-                  const gid = `roofline-gradient-${chartId}-${key}`;
-                  activeGradientIds.add(gid);
-                  let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
-                  if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
-                  gradient
-                    .attr('gradientUnits', 'userSpaceOnUse')
-                    .attr('x1', xScale(pts[0].x))
-                    .attr('y1', 0)
-                    .attr('x2', xScale(pts.at(-1)!.x))
-                    .attr('y2', 0);
-                  gradient
-                    .selectAll('stop')
-                    .data(stops)
-                    .join('stop')
-                    .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
-                    .attr('stop-color', (s) => s.color);
-                  stroke = `url(#${gid})`;
-                }
+            const baseStroke = getCssColor(resolveColor(hw));
+
+            // Split into per-date sub-paths so the line never crosses dates.
+            // (When only one date is present the loop runs once with the full set.)
+            const byDate = new Map<string, InferenceData[]>();
+            for (const p of pts) {
+              let bucket = byDate.get(p.date);
+              if (!bucket) {
+                bucket = [];
+                byDate.set(p.date, bucket);
               }
+              bucket.push(p);
             }
+            const singleDate = byDate.size === 1;
+
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length <= 1) continue;
+              const entryKey = singleDate ? key : `${key}__${date}`;
+              let stroke = baseStroke;
+
+              // Gradient labels only apply in the single-date case; mapping the
+              // (key-wide) ParetoPointLabel array onto per-date sub-segments is
+              // ambiguous and the comparison-date overlay is a rare combo.
+              if (singleDate && showGradientLabels) {
+                const pointLabels = allPointLabelsByKey[key];
+                if (pointLabels) {
+                  const stops = computeGradientStops(pointLabels, xScale);
+                  if (stops) {
+                    const gid = `roofline-gradient-${chartId}-${entryKey}`;
+                    activeGradientIds.add(gid);
+                    let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
+                    if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
+                    gradient
+                      .attr('gradientUnits', 'userSpaceOnUse')
+                      .attr('x1', xScale(datePoints[0].x))
+                      .attr('y1', 0)
+                      .attr('x2', xScale(datePoints.at(-1)!.x))
+                      .attr('y2', 0);
+                    gradient
+                      .selectAll('stop')
+                      .data(stops)
+                      .join('stop')
+                      .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
+                      .attr('stop-color', (s) => s.color);
+                    stroke = `url(#${gid})`;
+                  }
+                }
+              }
 
-            entries.push({ key, hw, precision, points: pts, stroke, visible });
+              entries.push({
+                key: entryKey,
+                hw,
+                precision,
+                points: datePoints,
+                stroke,
+                visible,
+              });
+            }
           });
 
           // Remove stale gradients
@@ -1271,11 +1359,26 @@ const ScatterGraph = React.memo(
             .y((d) => newYScale(d.y))
             .curve(d3.curveMonotoneX);
 
-          // Update roofline paths
+          // Update roofline paths — must split per-date so the zoom redraw
+          // matches the per-date sub-paths created in the initial render.
           Object.entries(rooflines).forEach(([key, pts]) => {
             if (pts.length < 2) return;
-            const sel = zoomGroup.select<SVGPathElement>(`.roofline-${key}`);
-            if (!sel.empty()) sel.attr('d', lineGen(pts) as string);
+            const byDate = new Map<string, InferenceData[]>();
+            for (const p of pts) {
+              let bucket = byDate.get(p.date);
+              if (!bucket) {
+                bucket = [];
+                byDate.set(p.date, bucket);
+              }
+              bucket.push(p);
+            }
+            const singleDate = byDate.size === 1;
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length < 2) continue;
+              const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`;
+              const sel = zoomGroup.select<SVGPathElement>(`.${CSS.escape(cls)}`);
+              if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string);
+            }
           });
 
           // Update gradient coordinates
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 3154070a..ccc371f9 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -19,6 +19,13 @@ export interface TooltipConfig {
   isTracked?: boolean;
   /** URL to the GitHub Actions workflow run */
   runUrl?: string;
+  /**
+   * Per-request ISL/OSL arrays for agentic points, sourced from the stored
+   * aiperf `profile_export.jsonl`. Used to detect whether the point has any
+   * trace data (so the "View charts" button can appear); the actual
+   * distributions are rendered on the detail page, not inline.
+   */
+  traceHistogram?: { isl: number[]; osl: number[] } | undefined;
 }
 
 export interface OverlayTooltipConfig extends TooltipConfig {
@@ -138,9 +145,24 @@ const generateAgenticHTML = (d: InferenceData): string => {
     parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
   }
 
+  // Histograms + time-series live on the dedicated detail page now; the
+  // "View charts" button (rendered by the wrapper when pinned + has trace
+  // data) takes the user there.
+
   return parts.join('');
 };
 
+/** Markup for the "View charts" button; empty unless the tooltip is pinned AND
+ *  the point has stored trace data. Wired up by the ScatterGraph click handler. */
+const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string =>
+  isPinned && hasTraceData
+    ? `<button data-action="view-charts" style="
+    margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
+    border: 1px solid var(--border); border-radius: 6px; cursor: pointer;
+    background: var(--accent); color: var(--accent-foreground);
+  ">View charts &rarr;</button>`
+    : '';
+
 const shortenSha = (image: string) => image.replaceAll(/(sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…');
 
 const imageTooltipLine = (image: string) =>
@@ -191,7 +213,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
  * @returns HTML string for the tooltip content
  */
 export const generateTooltipContent = (config: TooltipConfig): string => {
-  const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+  const {
+    data: d,
+    isPinned,
+    xLabel,
+    yLabel,
+    selectedYAxisMetric,
+    hardwareConfig,
+    runUrl,
+    traceHistogram,
+  } = config;
 
   return `
     <div style="background: var(--popover); border: 1px solid var(--border); border-radius: 8px; padding: 12px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1); user-select: ${isPinned ? 'text' : 'none'};">
@@ -240,6 +271,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
+      ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx
index 0392ac10..44013b1b 100644
--- a/packages/app/src/components/ui/d3-chart-wrapper.tsx
+++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx
@@ -1,6 +1,41 @@
 'use client';
 
-import React from 'react';
+import React, { useEffect, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+/**
+ * Tooltip host element for the d3 layer. After the first client-side effect it
+ * is portalled into document.body so it escapes any parent stacking context
+ * (e.g. the chart Card's backdrop-filter creates one, trapping z-index inside
+ * it); before that it renders in place. The d3 layer positions it in viewport coordinates.
+ */
+function PortalTooltip({
+  tooltipRef,
+  pinned,
+}: {
+  tooltipRef: React.RefObject<HTMLDivElement | null>;
+  pinned: boolean;
+}) {
+  // `mounted` flips to true in an effect, i.e. only after the first render.
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
+  // Hidden and non-interactive while `pinned` is false.
+  const style: React.CSSProperties = {
+    position: 'fixed',
+    left: 0,
+    top: 0,
+    opacity: pinned ? 1 : 0,
+    pointerEvents: pinned ? 'auto' : 'none',
+    display: pinned ? 'block' : 'none',
+    zIndex: 9999,
+  };
+  const node = <div ref={tooltipRef} data-chart-tooltip style={style} />;
+
+  // Portal only once mounted and a DOM exists; otherwise render in place.
+  const canPortal = mounted && typeof document !== 'undefined';
+  return canPortal ? createPortal(node, document.body) : node;
+}
 
 export interface D3ChartWrapperProps {
   chartId: string;
@@ -72,17 +107,11 @@ export function D3ChartWrapper({
                 }
               }}
             />
-            <div
-              ref={tooltipRef}
-              data-chart-tooltip
-              style={{
-                position: 'absolute',
-                opacity: pinnedPoint ? 1 : 0,
-                pointerEvents: pinnedPoint ? 'auto' : 'none',
-                display: pinnedPoint ? 'block' : 'none',
-                zIndex: 50,
-              }}
-            />
+            {/* Tooltip is portalled to <body> with position:fixed so it can
+                rise above sibling chart cards' stacking contexts. The d3 layer
+                writes viewport-coords into style.left/top — see
+                computeTooltipPosition. */}
+            <PortalTooltip tooltipRef={tooltipRef} pinned={Boolean(pinnedPoint)} />
             {noDataOverlay}
           </div>
           <p className="no-export text-xs text-muted-foreground text-center mt-2">{instructions}</p>
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index aa0f6c43..3c24d32b 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r
 /** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */
 function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'sglang',
     model: 'dsr1',
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
new file mode 100644
index 00000000..1ea90c0d
--- /dev/null
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -0,0 +1,46 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  offload_mode: string | null; // "on" | "off" | null
+  decode_tp: number;
+  decode_ep: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  is_current: boolean; // true when this row is the point that was requested
+  has_trace: boolean; // whether the row has a stored trace_replay blob
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  github_run_id: number;
+  date: string;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export function useBenchmarkSiblings(id: number | null) {
+  return useQuery({
+    queryKey: ['benchmark-siblings', id] as const,
+    queryFn: async ({ signal }): Promise<BenchmarkSiblings | null> => {
+      const response = await fetch(`/api/v1/benchmark-siblings?id=${id}`, { signal });
+      if (response.ok) return (await response.json()) as BenchmarkSiblings;
+      if (response.status === 404) return null; // 404 yields null data, not an error
+      throw new Error(`benchmark-siblings ${response.status}`);
+    },
+    enabled: id !== null && id > 0, // never fire for a null or non-positive id
+    staleTime: 5 * 60 * 1000, // 5 minutes
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts
new file mode 100644
index 00000000..db4220d2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-histograms.ts
@@ -0,0 +1,39 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TraceHistogramPoint {
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+async function fetchTraceHistograms(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<TraceHistogramMap> {
+  if (ids.length === 0) return {}; // nothing requested: skip the network round-trip
+  const response = await fetch(`/api/v1/trace-histograms?ids=${ids.join(',')}`, { signal });
+  if (response.ok) return (await response.json()) as TraceHistogramMap;
+  throw new Error(`trace-histograms ${response.status}`);
+}
+
+/**
+ * Query hook returning per-request ISL/OSL arrays keyed by benchmark_results.id.
+ * Ids without a stored trace_replay blob are silently omitted from the response.
+ *
+ * Callers pass the agentic ids currently on screen. The ids are de-duplicated
+ * and sorted before being comma-joined into the cache key, so any ordering of
+ * the same set shares one cache entry; nothing is fetched for an empty set.
+ */
+export function useTraceHistograms(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).sort((a, b) => a - b);
+  return useQuery({
+    queryKey: ['trace-histograms', uniqueIds.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceHistograms(uniqueIds, signal),
+    enabled: uniqueIds.length > 0 && enabled,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
new file mode 100644
index 00000000..8418aa4f
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -0,0 +1,70 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  run_url: string | null;
+  server_gpu_cache_hit_rate: number | null;
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  meta: PointMeta;
+  startNs: number;
+  endNs: number;
+  durationS: number;
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+async function fetchTraceServerMetrics(
+  id: number,
+  signal?: AbortSignal,
+): Promise<TraceServerMetrics | null> {
+  const response = await fetch(`/api/v1/trace-server-metrics?id=${id}`, { signal });
+  if (response.ok) return (await response.json()) as TraceServerMetrics;
+  if (response.status !== 404) throw new Error(`trace-server-metrics ${response.status}`);
+  return null; // 404 is reported as "no data for this id", not as an error
+}
+
+/**
+ * Query hook for the parsed server-metric time-series of one agentic point.
+ * Off by default: it only fetches when the caller passes `enabled=true` (the
+ * detail panel opens) and `id` is truthy, so hovering never pays the parse cost.
+ */
+export function useTraceServerMetrics(id: number | null, enabled = false) {
+  return useQuery({
+    queryKey: ['trace-server-metrics', id] as const,
+    queryFn: async ({ signal }: { signal: AbortSignal }) =>
+      id ? fetchTraceServerMetrics(id, signal) : null,
+    enabled: Boolean(id) && enabled,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 435f7629..98587c2f 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -6,6 +6,8 @@
 import type { SubmissionsResponse } from './submissions-types';
 
 export interface BenchmarkRow {
+  /** Stable per-point id from benchmark_results; used to look up trace histograms. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 6a6c97c8..fcbca681 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -6,6 +6,7 @@ import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform
 
 function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'trt',
     model: 'dsr1',
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index eb62a18a..c5bdd6ed 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -49,6 +49,8 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     row.offload_mode ??
     (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
+    // Coerce: Postgres bigint comes through the SQL client as a string.
+    id: typeof row.id === 'number' ? row.id : Number(row.id),
     hw: row.hardware,
     framework: row.framework,
     model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts
index 3b49dfbc..da81ca0e 100644
--- a/packages/app/src/lib/compare-pair-defaults.test.ts
+++ b/packages/app/src/lib/compare-pair-defaults.test.ts
@@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults';
 
 function makeRow(overrides: Partial<BenchmarkRow>): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h100',
     framework: 'sglang',
     model: 'dsr1',
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 4fa19fe8..421ac69b 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -289,7 +289,21 @@ export function attachScatterTooltipHandlers<
     });
 }
 
-/** Compute tooltip left/top, flipping when it would overflow the chart container. */
+/**
+ * Compute tooltip left/top **in viewport coordinates** so the tooltip can be
+ * rendered via portal with `position: fixed`. Callers still pass cursor coords
+ * relative to `container` (matching `d3.pointer(event, container)`).
+ *
+ * Why viewport coords: the chart cards use `backdrop-filter`, which creates
+ * a stacking context. A tooltip painted inside the upper card's stacking
+ * context cannot rise above the lower card's stacking context regardless of
+ * its z-index. Portalling to document.body + `position: fixed` sidesteps the
+ * whole problem; we just need the coordinates in viewport space.
+ *
+ * Strategy: pick preferred side (right/below cursor), flip if it overflows the
+ * container, then clamp to container bounds (inset by EDGE_PAD). A tooltip larger
+ * than the container is pinned to its top/left edge and overflows the far side.
+ */
 export function computeTooltipPosition(
   mx: number,
   my: number,
@@ -308,13 +322,21 @@ export function computeTooltipPosition(
   // Force reflow so we get real dimensions
   const tw = node.getBoundingClientRect().width || node.offsetWidth;
   const th = node.getBoundingClientRect().height || node.offsetHeight;
+  const rect = container.getBoundingClientRect();
   const cw = container.clientWidth;
   const ch = container.clientHeight;
+  const EDGE_PAD = 4;
+
+  // Prefer right of cursor; flip to left if no room.
+  let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw;
+  left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left));
 
-  const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset;
-  const top = my + offset + th > ch ? my - offset - th : my + offset;
+  // Prefer below cursor; flip above if no room.
+  let top = my + offset + th <= ch ? my + offset : my - offset - th;
+  top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
 
-  return { left, top };
+  // Convert container-local coords → viewport coords for `position: fixed`.
+  return { left: left + rect.left, top: top + rect.top };
 }
 
 /** Update scatter point positions on zoom. */
diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql
new file mode 100644
index 00000000..398bc725
--- /dev/null
+++ b/packages/db/migrations/006_agentic_trace_replay.sql
@@ -0,0 +1,34 @@
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_<suffix>` artifact:
+--   - profile_export.jsonl         (~2 MB raw, per-request data)
+--   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+  id                                bigserial   primary key,
+  -- gzip(profile_export.jsonl); null when only the server metrics file existed
+  profile_export_jsonl_gz           bytea,
+  profile_export_uncompressed_size  bigint,
+  -- raw csv bytes; null when only the profile file existed
+  server_metrics_csv                bytea,
+  server_metrics_csv_size           bigint,
+  created_at                        timestamptz not null default now()
+);
+
+alter table benchmark_results
+  add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+  on benchmark_results (trace_replay_id)
+  where trace_replay_id is not null;
diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
new file mode 100644
index 00000000..ba7bd095
--- /dev/null
+++ b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
@@ -0,0 +1,17 @@
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+  add column server_metrics_json_gz bytea,
+  add column server_metrics_json_uncompressed_size bigint;
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
     expect(tracker.skips.unmappedHw).toBe(0);
     expect(tracker.skips.noIslOsl).toBe(0);
     expect(tracker.skips.dbError).toBe(0);
+    expect(tracker.skips.traceReplayMissing).toBe(0);
   });
 
   it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 588718dd..401d197c 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -10,6 +10,8 @@ export interface Skips {
   noIslOsl: number;
   failedRun: number;
   dbError: number;
+  /** Agentic point whose sibling `agentic_<suffix>` artifact had no trace_replay files. */
+  traceReplayMissing: number;
 }
 
 export interface SkipSnapshot {
@@ -74,6 +76,7 @@ export function createSkipTracker(): SkipTracker {
     noIslOsl: 0,
     failedRun: 0,
     dbError: 0,
+    traceReplayMissing: 0,
   };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..8c6d92b6
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,83 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl`, `server_metrics_export.csv`,
+ * and `server_metrics_export.json`) into `agentic_trace_replay` and link the new row
+ * to each provided benchmark_results row via `trace_replay_id`.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */
+
+import { gzipSync } from 'node:zlib';
+
+import type postgres from 'postgres';
+
+type Sql = ReturnType<typeof postgres>;
+
+/**
+ * Persist the per-point trace files and link them to `benchmarkResultIds`.
+ * Insert and link run as one SQL statement, so a failed link also rolls back the blob insert.
+ *
+ * @param sql                 Active `postgres` connection.
+ * @param benchmarkResultIds  DB ids of the benchmark_results rows produced by
+ *                            the `bmk_agentic_<suffix>` artifact whose sibling
+ *                            `agentic_<suffix>` directory holds these trace files.
+ * @param profileExportJsonl  Raw bytes of `profile_export.jsonl`, or null.
+ *                            Gzipped before storage.
+ * @param serverMetricsCsv    Raw bytes of `server_metrics_export.csv`, or null.
+ *                            Stored as-is.
+ * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` —
+ *                            per-scrape time-series of every Prometheus metric.
+ *                            Optional, gzipped before storage (~42x ratio).
+ */
+export async function insertTraceReplay(
+  sql: Sql,
+  benchmarkResultIds: number[],
+  profileExportJsonl: Buffer | null,
+  serverMetricsCsv: Buffer | null,
+  serverMetricsJson: Buffer | null = null,
+): Promise<void> {
+  if (benchmarkResultIds.length === 0) return;
+  if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
+
+  // Only link rows that don't already point at a trace_replay row — keeps
+  // re-ingest from inserting duplicate sibling blobs.
+  const unlinked = await sql<{ id: number }[]>`
+    select id from benchmark_results
+    where id = any(${sql.array(benchmarkResultIds)}::bigint[])
+      and trace_replay_id is null
+  `;
+  if (unlinked.length === 0) return;
+
+  const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
+  const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
+  const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
+  const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
+  const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+  // One statement (data-modifying CTE): insert the blob row and link it atomically.
+  await sql`
+    with inserted as (
+      insert into agentic_trace_replay (
+        profile_export_jsonl_gz,
+        profile_export_uncompressed_size,
+        server_metrics_csv,
+        server_metrics_csv_size,
+        server_metrics_json_gz,
+        server_metrics_json_uncompressed_size
+      )
+      values (
+        ${profileGz},
+        ${profileSize},
+        ${serverMetricsCsv},
+        ${csvSize},
+        ${metricsJsonGz},
+        ${metricsJsonSize}
+      )
+      returning id
+    )
+    update benchmark_results
+    set trace_replay_id = (select id from inserted)
+    where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+      and trace_replay_id is null
+  `;
+}
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 35183789..eeb55313 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -45,6 +45,7 @@ import {
   bulkUpsertAvailability,
   insertServerLog,
 } from './etl/benchmark-ingest';
+import { insertTraceReplay } from './etl/trace-replay-ingest';
 import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
 import { ingestEvalRow } from './etl/eval-ingest';
 import { mapEvalSamples } from './etl/eval-samples-mapper';
@@ -209,6 +210,14 @@ const ARTIFACT_NAMES = {
   changelog: 'changelog-metadata',
 } as const;
 
+/** Bare `<suffix>` shared by `bmk_agentic_<suffix>` and its sibling `agentic_<suffix>` dir. */
+const stripBmkAndAgenticPrefix = (s: string): string => {
+  // Order matters: `bmk_` is removed first so an exposed `agentic_` can follow.
+  const withoutBmk = s.replace(/^bmk_/u, '');
+  const withoutAgentic = withoutBmk.replace(/^agentic_/u, '');
+  return withoutAgentic;
+};
+
 function readJson(filePath: string): unknown {
   try {
     return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -327,6 +336,7 @@ async function main(): Promise<void> {
   let totalSamples = 0;
   let totalSampleFiles = 0;
   let totalChangelogs = 0;
+  let totalTraceReplayLinked = 0;
 
   // ── Check for evals-only flag in changelog ────────────────────────────
   const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -381,6 +391,56 @@ async function main(): Promise<void> {
       console.log(`  Found ${serverLogPaths.size} server log artifact(s)`);
     }
 
+    // Sibling aiperf artifacts: each `bmk_agentic_<suffix>` is paired with an
+    // `agentic_<suffix>` dir holding `profile_export.jsonl`,
+    // `server_metrics_export.csv`, and `server_metrics_export.json`. The harness
+    // emits these under either `trace_replay/` (older layout) or
+    // `aiperf_artifacts/` (current). Older non-aiperf agentic runs don't ship
+    // this sibling. Key on the bare suffix so both names map to the same Map entry.
+    const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+    const traceReplayPaths = new Map<
+      string,
+      {
+        profileJsonl: string | null;
+        serverMetricsCsv: string | null;
+        serverMetricsJson: string | null;
+      }
+    >();
+    if (fs.existsSync(artifactsDir)) {
+      for (const d of fs.readdirSync(artifactsDir)) {
+        if (!d.startsWith('agentic_')) continue;
+        let profile: string | null = null;
+        let metrics: string | null = null;
+        let metricsJson: string | null = null;
+        for (const sub of TRACE_SUBDIRS) {
+          const dir = path.join(artifactsDir, d, sub);
+          if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue;
+          if (!profile) {
+            const p = path.join(dir, 'profile_export.jsonl');
+            if (fs.existsSync(p)) profile = p;
+          }
+          if (!metrics) {
+            const m = path.join(dir, 'server_metrics_export.csv');
+            if (fs.existsSync(m)) metrics = m;
+          }
+          if (!metricsJson) {
+            const j = path.join(dir, 'server_metrics_export.json');
+            if (fs.existsSync(j)) metricsJson = j;
+          }
+        }
+        if (!profile && !metrics && !metricsJson) continue;
+        const suffix = stripBmkAndAgenticPrefix(d);
+        traceReplayPaths.set(suffix, {
+          profileJsonl: profile,
+          serverMetricsCsv: metrics,
+          serverMetricsJson: metricsJson,
+        });
+      }
+    }
+    if (traceReplayPaths.size > 0) {
+      console.log(`  Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
+    }
+
     const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
     console.log(`  Found ${allBmkFiles.length} benchmark JSON file(s)`);
 
@@ -448,12 +508,42 @@ async function main(): Promise<void> {
               }
             }
           }
+
+          // Trace-replay sibling lookup for agentic points only. The aiperf
+          // harness emits `agentic_<suffix>/{aiperf_artifacts,trace_replay}/...`
+          // next to the `bmk_agentic_<suffix>` artifact we just ingested.
+          if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
+            const suffix = stripBmkAndAgenticPrefix(parentDir);
+            const trace = traceReplayPaths.get(suffix);
+            if (trace) {
+              try {
+                const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
+                const metrics = trace.serverMetricsCsv
+                  ? fs.readFileSync(trace.serverMetricsCsv)
+                  : null;
+                const metricsJson = trace.serverMetricsJson
+                  ? fs.readFileSync(trace.serverMetricsJson)
+                  : null;
+                await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson);
+                totalTraceReplayLinked += insertedIds.length;
+              } catch (error: any) {
+                tracker.recordDbError(`trace_replay for ${suffix}`, error);
+              }
+            } else {
+              tracker.skips.traceReplayMissing++;
+            }
+          }
         } catch (error: any) {
           tracker.recordDbError(path.basename(file), error);
         }
       }
     }
     console.log(`  Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
+    if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {
+      console.log(
+        `  Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`,
+      );
+    }
 
     if (availRows.length > 0) {
       try {
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 6857f817..b4a6fb95 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -458,6 +458,8 @@ async function mapWorkflowDir(
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
       failedRun: local.skips.failedRun,
+      // GCS backup doesn't ingest aiperf trace files; counter stays 0.
+      traceReplayMissing: local.skips.traceReplayMissing,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 19527f22..785d82c4 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -273,6 +273,7 @@ function toBenchmarkRow(
   metrics?: Record<string, number>,
 ): BenchmarkRow {
   return {
+    id: br.id,
     hardware: c.hardware,
     framework: c.framework,
     model: c.model,
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
new file mode 100644
index 00000000..245a1170
--- /dev/null
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -0,0 +1,132 @@
+/**
+ * Find all benchmark_results that share the same SKU (hardware + framework +
+ * model + precision + spec_method + benchmark_type + workflow_run) as the
+ * given point. `disagg` is returned per sibling but is not filtered on. Used by
+ * the detail page's "switch between concs / parallelisms" navigator for one run.
+ */
+
+import type { DbClient } from '../connection.js';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  /** "on" | "off" | null. */
+  offload_mode: string | null;
+  decode_tp: number;
+  decode_ep: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  /** True if this row IS the point passed in. */
+  is_current: boolean;
+  /** Whether the row is linked to a trace_replay row, i.e. trace_replay_id is set (navigation hint). */
+  has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  /** Run provenance for the page header: workflow_runs.github_run_id plus the result date. */
+  github_run_id: number;
+  date: string;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export async function getBenchmarkSiblings(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<BenchmarkSiblings | null> {
+  // Step 1: resolve the SKU defining fields for the requested point.
+  const seed = (await sql`
+    select
+      c.hardware, c.framework, c.model, c.precision, c.spec_method,
+      br.benchmark_type, br.workflow_run_id, br.date::text,
+      wr.github_run_id
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as {
+    hardware: string;
+    framework: string;
+    model: string;
+    precision: string;
+    spec_method: string;
+    benchmark_type: string;
+    workflow_run_id: number;
+    date: string;
+    github_run_id: number;
+  }[];
+  const root = seed[0];
+  if (!root) return null;
+
+  // Step 2: pull every sibling row sharing the SKU within the same workflow_run.
+  const rows = (await sql`
+    select
+      br.id, br.conc, br.offload_mode,
+      c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg,
+      (br.trace_replay_id is not null) as has_trace
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    where br.workflow_run_id = ${root.workflow_run_id}
+      and br.benchmark_type = ${root.benchmark_type}
+      and c.hardware = ${root.hardware}
+      and c.framework = ${root.framework}
+      and c.model = ${root.model}
+      and c.precision = ${root.precision}
+      and c.spec_method = ${root.spec_method}
+    order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc
+  `) as unknown as {
+    id: number;
+    conc: number;
+    offload_mode: string | null;
+    decode_tp: number;
+    decode_ep: number;
+    prefill_tp: number;
+    prefill_ep: number;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    disagg: boolean;
+    has_trace: boolean;
+  }[];
+
+  const siblings: BenchmarkSibling[] = rows.map((r) => ({
+    id: Number(r.id),
+    conc: r.conc,
+    offload_mode: r.offload_mode,
+    decode_tp: r.decode_tp,
+    decode_ep: r.decode_ep,
+    prefill_tp: r.prefill_tp,
+    prefill_ep: r.prefill_ep,
+    num_prefill_gpu: r.num_prefill_gpu,
+    num_decode_gpu: r.num_decode_gpu,
+    disagg: r.disagg,
+    is_current: Number(r.id) === benchmarkResultId,
+    has_trace: r.has_trace,
+  }));
+
+  return {
+    sku: {
+      hardware: root.hardware,
+      framework: root.framework,
+      model: root.model,
+      precision: root.precision,
+      spec_method: root.spec_method,
+      benchmark_type: root.benchmark_type,
+      github_run_id: Number(root.github_run_id),
+      date: root.date,
+    },
+    siblings,
+  };
+}
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 74e20380..36bb0e65 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -1,6 +1,13 @@
 import type { DbClient } from '../connection.js';
 
 export interface BenchmarkRow {
+  /**
+   * Stable per-point id from benchmark_results. Used by the frontend to look
+   * up associated detail blobs (e.g. trace_replay histograms).
+   * Typed as number although the column is a Postgres bigint — ids above
+   * Number.MAX_SAFE_INTEGER would lose precision, but real ids stay far below it.
+   */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -55,6 +62,7 @@ export async function getLatestBenchmarks(
     const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`;
     const rows = await sql`
       SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+        br.id,
         c.hardware,
         c.framework,
         c.model,
@@ -95,6 +103,7 @@ export async function getLatestBenchmarks(
   // No date filter: use materialized view for instant lookups
   const rows = await sql`
     SELECT
+      lb.id,
       c.hardware,
       c.framework,
       c.model,
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
new file mode 100644
index 00000000..c243afd8
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -0,0 +1,82 @@
+/**
+ * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl`
+ * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller
+ * passes the set of `benchmark_results.id`s it wants and receives one entry
+ * per id that actually has a trace_replay blob (others are silently skipped).
+ *
+ * The JSONL has one JSON object per request with the shape:
+ *   { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... } }
+ *
+ * Returns raw arrays rather than pre-binned histograms — payload stays tiny
+ * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin
+ * however it wants.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface TraceHistogramPoint {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+export async function getTraceHistograms(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceHistogramMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.profile_export_jsonl_gz as blob
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+      and atr.profile_export_jsonl_gz is not null
+  `) as { benchmark_result_id: number; blob: Buffer }[];
+
+  const result: TraceHistogramMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const isl: number[] = [];
+      const osl: number[] = [];
+      for (const line of jsonl.split('\n')) {
+        if (!line) continue;
+        let rec: { metrics?: Record<string, { value?: number } | number> };
+        try {
+          rec = JSON.parse(line);
+        } catch {
+          continue;
+        }
+        const m = rec.metrics ?? {};
+        const islVal = readMetric(m['input_sequence_length']);
+        const oslVal = readMetric(m['output_sequence_length']);
+        if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal);
+        if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal);
+      }
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        isl,
+        osl,
+      };
+    } catch {
+      // Drop malformed blobs silently — caller treats missing ids as "no data".
+    }
+  }
+  return result;
+}
+
+function readMetric(v: { value?: number } | number | undefined): number | undefined {
+  if (v === undefined || v === null) return undefined;
+  if (typeof v === 'number') return v;
+  return v.value;
+}
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
new file mode 100644
index 00000000..822ae633
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -0,0 +1,275 @@
+/**
+ * Parse aiperf's `server_metrics_export.json` blob (gzipped in
+ * `agentic_trace_replay.server_metrics_json_gz`) and return a slim, chart-ready
+ * time-series for one benchmark point.
+ *
+ * The raw JSON has shape:
+ *   metrics: {
+ *     "<metric_name>": {
+ *       series: [
+ *         {
+ *           labels: { ... },
+ *           stats: { ... summary ... },
+ *           timeslices: [
+ *             { start_ns, end_ns, avg, min, max }            // gauges
+ *             { start_ns, end_ns, total, rate }              // counters
+ *           ]
+ *         }
+ *       ]
+ *     }
+ *   }
+ *
+ * Timeslices are ~1 Hz windows. The benchmark window can be tens of minutes
+ * (1800+ windows). We return them as `[{ t, ...}]` arrays with `t` measured
+ * in seconds from the benchmark start so the frontend doesn't need to
+ * shuffle bigint nanoseconds around.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+interface GaugeSlice {
+  start_ns: number;
+  end_ns: number;
+  avg?: number;
+  min?: number;
+  max?: number;
+}
+
+interface CounterSlice {
+  start_ns: number;
+  end_ns: number;
+  total?: number;
+  rate?: number;
+}
+
+interface Series {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+  stats?: Record<string, unknown>;
+  timeslices?: (GaugeSlice & CounterSlice)[];
+}
+
+interface MetricsJson {
+  metrics?: Record<string, { type?: string; description?: string; series?: Series[] }>;
+}
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  /** running + waiting, precomputed for convenience — frontend can compute it too. */
+  total: number;
+}
+
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  /** GitHub Actions run URL for jumping to the source. */
+  run_url: string | null;
+  /** Cumulative end-of-run cache-hit number the dashboard already shows. */
+  server_gpu_cache_hit_rate: number | null;
+  /** Cumulative end-of-run CPU offload cache-hit. */
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  /** Point context — hardware, model, conc, etc. for the page header. */
+  meta: PointMeta;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */
+  kvCacheUsage: TimeSeriesPoint[];
+  /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). */
+  prefixCacheHitRate: TimeSeriesPoint[];
+  /** Request queue depth: running, waiting, total per scrape. */
+  queueDepth: QueueDepthPoint[];
+  /**
+   * Per-source prompt-token counts over time (counter rate per scrape).
+   * Keyed by the value of the `source` label (typically `local_cache_hit`,
+   * `external_cache_hit`, `miss`, etc.). Plot as stacked area.
+   */
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. */
+  prefillTps: TimeSeriesPoint[];
+  /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
+  decodeTps: TimeSeriesPoint[];
+}
+
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      atr.server_metrics_json_gz as blob,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as ({ blob: Buffer | null } & PointMeta)[];
+  const row = rows[0];
+  if (!row) return null;
+  const blob = row.blob;
+  if (!blob) return null;
+  const pointMeta: PointMeta = {
+    id: Number(row.id),
+    hardware: row.hardware,
+    framework: row.framework,
+    model: row.model,
+    precision: row.precision,
+    spec_method: row.spec_method,
+    disagg: row.disagg,
+    conc: row.conc,
+    offload_mode: row.offload_mode,
+    isl: row.isl,
+    osl: row.osl,
+    benchmark_type: row.benchmark_type,
+    date: row.date,
+    run_url: row.run_url,
+    server_gpu_cache_hit_rate:
+      row.server_gpu_cache_hit_rate === null ? null : Number(row.server_gpu_cache_hit_rate),
+    server_cpu_cache_hit_rate:
+      row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
+  };
+
+  const parsed = JSON.parse(gunzipSync(blob).toString('utf8')) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+
+  const firstSeries = (name: string): Series | undefined => {
+    const s = metrics[name]?.series;
+    return s && s.length > 0 ? s[0] : undefined;
+  };
+
+  // Compute the timing reference (earliest start, latest end) across every series of every metric.
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1)
+  const kvCacheUsage: TimeSeriesPoint[] = [];
+  const kvSeries =
+    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number') {
+      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
+    }
+  }
+
+  // Prefix cache hit rate per scrape: Δhits / Δqueries, taken as the ratio of the
+  // two counters' `rate` for slices paired by index; windows with no queries are skipped.
+  const hitsTs = firstSeries('vllm:prefix_cache_hits')?.timeslices ?? [];
+  const qsTs = firstSeries('vllm:prefix_cache_queries')?.timeslices ?? [];
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  const minLen = Math.min(hitsTs.length, qsTs.length);
+  for (let i = 0; i < minLen; i++) {
+    const h = hitsTs[i]!;
+    const q = qsTs[i]!;
+    if (typeof q.rate === 'number' && q.rate > 0 && typeof h.rate === 'number') {
+      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
+    }
+  }
+
+  // Queue depth: pair running + waiting by index.
+  const runTs = firstSeries('vllm:num_requests_running')?.timeslices ?? [];
+  const waitTs = firstSeries('vllm:num_requests_waiting')?.timeslices ?? [];
+  const queueDepth: QueueDepthPoint[] = [];
+  const qlen = Math.min(runTs.length, waitTs.length);
+  for (let i = 0; i < qlen; i++) {
+    const r = runTs[i]!;
+    const w = waitTs[i]!;
+    const running = typeof r.avg === 'number' ? r.avg : 0;
+    const waiting = typeof w.avg === 'number' ? w.avg : 0;
+    queueDepth.push({
+      t: tOf(r.start_ns),
+      running,
+      waiting,
+      total: running + waiting,
+    });
+  }
+
+  // Throughput: extract counter `rate` (already per-second delta from aiperf).
+  const counterRateSeries = (name: string): TimeSeriesPoint[] => {
+    const s = firstSeries(name);
+    if (!s) return [];
+    const out: TimeSeriesPoint[] = [];
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.rate === 'number') out.push({ t: tOf(ts.start_ns), value: ts.rate });
+    }
+    return out;
+  };
+  const prefillTps = counterRateSeries('vllm:prompt_tokens');
+  const decodeTps = counterRateSeries('vllm:generation_tokens');
+
+  // Per-source prompt tokens — emit one TS array per source label.
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    const arr: TimeSeriesPoint[] = [];
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number') {
+        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  return {
+    meta: pointMeta,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}

From 0067bfcd72d0f57242a418e5acc1cef604135554 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 19:01:49 -0500
Subject: [PATCH 24/55] feat(agentic): hover crosshair + expand-to-dialog on
 detail charts

Refactor every chart on /inference/agentic/[id] from innerHTML string SVG
to JSX SVG so we can attach mouse handlers. New shared ChartHover overlay
renders a vertical crosshair following the cursor and a floating tooltip
listing series values at that x:
- TimeSeriesChart: linearly interpolated value per series, timestamp title
- Distribution: bin range + count + cumulative percentile under cursor
- StackedAreaChart: per-source % share at the nearest timeslice

Each chart card now has a maximize button that opens the same chart in
a Dialog at 1300x520 (vs 720x260 inline), preserving hover and all data
labels. Charts accept width/height props so they re-render appropriately
in either size.
---
 .../agentic-point/agentic-point-detail.tsx    | 334 +++++------
 .../inference/agentic-point/chart-hover.tsx   | 148 +++++
 .../inference/agentic-point/distribution.tsx  | 298 ++++++----
 .../agentic-point/expandable-chart.tsx        |  46 ++
 .../agentic-point/time-series-chart.tsx       | 525 ++++++++++++------
 5 files changed, 922 insertions(+), 429 deletions(-)
 create mode 100644 packages/app/src/components/inference/agentic-point/chart-hover.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/expandable-chart.tsx

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 3cd274ba..ee58332d 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -14,6 +14,7 @@ import {
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
 
 import { Distribution } from './distribution';
+import { ExpandableChart } from './expandable-chart';
 import { SiblingNav } from './sibling-nav';
 import {
   StackedAreaChart,
@@ -71,14 +72,11 @@ function PointSummary({ meta }: { meta: PointMeta }) {
   );
 }
 
-function ChartCard({ title, children }: { title: string; children: React.ReactNode }) {
-  return (
-    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
-      <h2 className="text-sm font-semibold text-foreground mb-3">{title}</h2>
-      {children}
-    </div>
-  );
-}
+/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */
+const CHART_SIZES = {
+  inline: { width: 720, height: 260 },
+  expanded: { width: 1300, height: 520 },
+};
 
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
@@ -131,164 +129,178 @@ export function AgenticPointDetail({ id }: Props) {
       )}
 
       <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-        <ChartCard title="Input sequence length distribution">
-          {hist ? (
-            <Distribution values={hist.isl} unit="tokens" />
-          ) : histQuery.isLoading ? (
-            <Skeleton />
-          ) : (
-            <Empty />
-          )}
-        </ChartCard>
-        <ChartCard title="Output sequence length distribution">
-          {hist ? (
-            <Distribution values={hist.osl} unit="tokens" />
-          ) : histQuery.isLoading ? (
-            <Skeleton />
-          ) : (
-            <Empty />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Input sequence length distribution"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
+            return histQuery.isLoading ? <Skeleton /> : <Empty />;
+          }}
+        />
+        <ExpandableChart
+          title="Output sequence length distribution"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
+            return histQuery.isLoading ? <Skeleton /> : <Empty />;
+          }}
+        />
 
-        <ChartCard title="KV cache utilization over time">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'GPU KV cache (avg n=50)',
-                  data: rollingAverage(metrics.kvCacheUsage, 50),
-                  rawData: metrics.kvCacheUsage,
-                  color: '#3b82f6',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yMax={1}
-              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-              yAxisLabel="KV cache (%)"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="KV cache utilization over time"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'GPU KV cache (avg n=50)',
+                    data: rollingAverage(metrics.kvCacheUsage, 50),
+                    rawData: metrics.kvCacheUsage,
+                    color: '#3b82f6',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yMax={1}
+                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                yAxisLabel="KV cache (%)"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Request queue depth">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'Running (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.running,
-                    })),
-                    50,
-                  ),
-                  color: '#22c55e',
-                  strokeWidth: 2,
-                },
-                {
-                  name: 'Waiting (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.waiting,
-                    })),
-                    50,
-                  ),
-                  color: '#ef4444',
-                  strokeWidth: 2,
-                },
-                {
-                  name: 'Total (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.total,
-                    })),
-                    50,
-                  ),
-                  color: '#3b82f6',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yAxisLabel="Requests"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Request queue depth"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'Running (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.running,
+                      })),
+                      50,
+                    ),
+                    color: '#22c55e',
+                    strokeWidth: 2,
+                  },
+                  {
+                    name: 'Waiting (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.waiting,
+                      })),
+                      50,
+                    ),
+                    color: '#ef4444',
+                    strokeWidth: 2,
+                  },
+                  {
+                    name: 'Total (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.total,
+                      })),
+                      50,
+                    ),
+                    color: '#3b82f6',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yAxisLabel="Requests"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Prefix cache hit rate per interval">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'GPU (HBM, avg n=50)',
-                  data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                  rawData: metrics.prefixCacheHitRate,
-                  color: '#a855f7',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yMax={1}
-              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-              yAxisLabel="Hit rate (%)"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Prefix cache hit rate per interval"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'GPU (HBM, avg n=50)',
+                    data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                    rawData: metrics.prefixCacheHitRate,
+                    color: '#a855f7',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yMax={1}
+                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                yAxisLabel="Hit rate (%)"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Throughput (total & decode)">
-          {metrics ? (
-            (() => {
-              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'Total (avg n=50)',
-                      data: rollingAverage(total, 50),
-                      color: '#3b82f6',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Decode (avg n=50)',
-                      data: rollingAverage(metrics.decodeTps, 50),
-                      color: '#f97316',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Total running avg',
-                      data: cumulativeAverage(total),
-                      color: '#ef4444',
-                      strokeWidth: 3,
-                    },
-                  ]}
-                  durationS={metrics.durationS}
-                  yAxisLabel="Tokens / sec"
-                />
-              );
-            })()
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Throughput (total & decode)"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'Total (avg n=50)',
+                    data: rollingAverage(total, 50),
+                    color: '#3b82f6',
+                    strokeWidth: 1.6,
+                  },
+                  {
+                    name: 'Decode (avg n=50)',
+                    data: rollingAverage(metrics.decodeTps, 50),
+                    color: '#f97316',
+                    strokeWidth: 1.6,
+                  },
+                  {
+                    name: 'Total running avg',
+                    data: cumulativeAverage(total),
+                    color: '#ef4444',
+                    strokeWidth: 3,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yAxisLabel="Tokens / sec"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Cumulative prompt token source breakdown">
-          {metrics ? (
-            <StackedAreaChart
-              sourceSeries={metrics.promptTokensBySource}
-              durationS={metrics.durationS}
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Cumulative prompt token source breakdown"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <StackedAreaChart
+                sourceSeries={metrics.promptTokensBySource}
+                durationS={metrics.durationS}
+                {...size}
+              />
+            );
+          }}
+        />
       </div>
     </div>
   );
diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
new file mode 100644
index 00000000..24270122
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
@@ -0,0 +1,148 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+
+/** One tooltip row: a color swatch, a label and an already-formatted value string. */
+export interface HoverItem {
+  /** CSS color for the swatch drawn before the label (applied as `backgroundColor`). */
+  color: string;
+  label: string;
+  value: string;
+  /** Optional secondary text. NOTE(review): `HoverTooltip` does not render this yet. */
+  hint?: string;
+}
+
+interface ChartHoverProps {
+  /** Plot-area insets in viewBox units; must match the padding the chart itself draws with. */
+  pad: { top: number; right: number; bottom: number; left: number };
+  /** SVG viewBox dimensions used to render the chart. */
+  width: number;
+  height: number;
+  /**
+   * Called on every mouse move with the cursor's x normalized to [0..1] across the plot area.
+   * Return `null` to hide crosshair and tooltip; empty `items` hides only the tooltip.
+   */
+  resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null;
+  children: ReactNode;
+}
+
+/**
+ * Wrap a chart's <svg> render to add mouse-driven crosshair + tooltip.
+ *
+ * The chart owner renders its bars / lines / axes via `children`; this wrapper
+ * owns the <svg> and adds a transparent <rect> across the plot area to capture
+ * mouse events, a dashed vertical line that follows the cursor, and a tooltip
+ * pinned to the top of the container beside that line: on its right while the
+ * cursor is in the left ~55% of the chart, on its left otherwise. A `null`
+ * result from `resolve` clears both the line and the tooltip.
+ */
+export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) {
+  const [hover, setHover] = useState<{
+    xPx: number;
+    fraction: number;
+    items: HoverItem[];
+    title?: string;
+  } | null>(null);
+
+  const innerW = width - pad.left - pad.right;
+  const innerH = height - pad.top - pad.bottom;
+
+  const onMove = (e: React.MouseEvent<SVGRectElement>) => {
+    const svg = e.currentTarget.ownerSVGElement;
+    if (!svg) return;
+    const rect = svg.getBoundingClientRect();
+    // Convert client x → SVG viewBox x (only x is needed: the crosshair is vertical).
+    const sx = ((e.clientX - rect.left) * width) / rect.width;
+    const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW));
+    const resolved = resolve(fraction);
+    if (!resolved) {
+      setHover(null);
+      return;
+    }
+    setHover({ xPx: sx, fraction, items: resolved.items, title: resolved.title });
+  };
+
+  const onLeave = () => setHover(null);
+
+  return (
+    <div className="relative w-full">
+      <svg
+        viewBox={`0 0 ${width} ${height}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+      >
+        {children}
+        {hover && (
+          <line
+            x1={hover.xPx}
+            x2={hover.xPx}
+            y1={pad.top}
+            y2={pad.top + innerH}
+            stroke="currentColor"
+            strokeWidth={1}
+            strokeDasharray="3 3"
+            opacity={0.4}
+            pointerEvents="none"
+          />
+        )}
+        <rect
+          x={pad.left}
+          y={pad.top}
+          width={innerW}
+          height={innerH}
+          fill="transparent"
+          onMouseMove={onMove}
+          onMouseLeave={onLeave}
+        />
+      </svg>
+      {hover && hover.items.length > 0 && (
+        <HoverTooltip
+          xFraction={hover.fraction}
+          containerWidth={width}
+          padLeft={pad.left}
+          innerW={innerW}
+          title={hover.title}
+          items={hover.items}
+        />
+      )}
+    </div>
+  );
+}
+
+function HoverTooltip({
+  xFraction,
+  containerWidth,
+  padLeft,
+  innerW,
+  title,
+  items,
+}: {
+  xFraction: number;
+  containerWidth: number;
+  padLeft: number;
+  innerW: number;
+  title?: string;
+  items: HoverItem[];
+}) {
+  // Horizontal anchor: the crosshair's x as a % of the viewBox width (the <svg> is w-full).
+  // Left of 55% the box opens rightwards from the crosshair; otherwise it opens leftwards.
+  const xPx = padLeft + xFraction * innerW;
+  const onRight = xPx < containerWidth * 0.55;
+  const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto';
+  const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`;
+  return (
+    <div
+      className="pointer-events-none absolute top-2 z-10 rounded-md border border-border bg-popover px-2 py-1.5 text-xs shadow-md"
+      style={{ left, right, marginLeft: onRight ? 8 : 0, marginRight: onRight ? 0 : 8 }}
+    >
+      {title && <div className="font-medium text-foreground mb-1">{title}</div>}
+      {items.map((it, i) => (
+        <div key={i} className="flex items-center gap-1.5 leading-tight">
+          <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: it.color }} />
+          <span className="text-muted-foreground">{it.label}</span>
+          <span className="ml-auto font-medium text-foreground tabular-nums">{it.value}</span>
+        </div>
+      ))}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
index c9a563fe..685b73f3 100644
--- a/packages/app/src/components/inference/agentic-point/distribution.tsx
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -1,140 +1,242 @@
 'use client';
 
-import { useMemo, useRef } from 'react';
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+
+const fmtNum = (n: number) =>
+  n < 10000 ? `${Math.round(n)}` : Math.round(n).toLocaleString('en-US');
 
 /**
  * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
  * detail-page card — fills its container width via `viewBox` + 100% width.
+ * Hover shows the bin range + count + cumulative percentile.
  */
 export function Distribution({
   values,
   unit,
+  width = 720,
   height = 260,
 }: {
   values: readonly number[];
   unit: string;
+  width?: number;
   height?: number;
 }) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const svgParts = useMemo(() => {
-    if (values.length === 0) return { bars: '', guides: '', legend: '', axis: '', yTicks: '' };
+  const computed = useMemo(() => {
+    if (values.length === 0) return null;
     const sorted = [...values].toSorted((a, b) => a - b);
     const min = sorted[0]!;
     const max = sorted.at(-1)!;
     const range = Math.max(1e-9, max - min);
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
-
-    // Sturges-ish, scaled with sample size, capped so bars stay visible.
     const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
     const counts: number[] = Array.from({ length: nBins }, () => 0);
     for (const v of values) {
       const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
       counts[i]!++;
     }
-    const maxCount = Math.max(...counts, 1);
-    const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
-    const barW = innerW / nBins;
-
-    const fmt = (n: number) =>
-      n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
-
-    const quantile = (q: number): number => {
-      const pos = (sorted.length - 1) * q;
-      const lo = Math.floor(pos);
-      const hi = Math.ceil(pos);
-      return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
-    };
-
-    const bars = counts
-      .map((c, i) => {
-        const h = (c / maxCount) * innerH;
-        const x = PAD.left + i * barW;
-        const y = PAD.top + (innerH - h);
-        return `<rect x="${x.toFixed(2)}" y="${y.toFixed(2)}" width="${Math.max(0, barW - 1).toFixed(2)}" height="${h.toFixed(2)}" fill="currentColor" opacity="0.55" />`;
-      })
-      .join('');
-
-    const GUIDES = [
-      { label: 'p50', q: 0.5, color: '#3b82f6' },
-      { label: 'p75', q: 0.75, color: '#22c55e' },
-      { label: 'p90', q: 0.9, color: '#f59e0b' },
-      { label: 'p95', q: 0.95, color: '#ef4444' },
-    ] as const;
-    const guides = GUIDES.map(({ q, color }) => {
-      const v = quantile(q);
-      const x = xScale(v);
-      return `<line x1="${x.toFixed(2)}" x2="${x.toFixed(2)}" y1="${PAD.top}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" opacity="0.95" />`;
-    }).join('');
-
-    // 4-tick x-axis: min, ~33%, ~66%, max
-    const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
-    const axisY = PAD.top + innerH + 14;
-    const axisLine = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${(PAD.top + innerH).toFixed(2)}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
-    const xLabels = xTickVals
-      .map((v, i) => {
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${xScale(v).toFixed(2)}" y="${axisY}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmt(v)}</text>`;
-      })
-      .join('');
-    const axisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">value (${unit})</text>`;
-
-    // 5-tick y-axis
-    const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
-    const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${PAD.left}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.4" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${fmt(v)}</text></g>`;
-      })
-      .join('');
-    const yAxisLabel = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">count</text>`;
-
-    const chipY = H - 8;
-    const chipW = innerW / GUIDES.length;
-    const legend = GUIDES.map(({ label: ql, q, color }, i) => {
-      const v = quantile(q);
-      const x = PAD.left + i * chipW;
-      return `
-      <line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" />
-      <text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${ql} ${fmt(v)}</text>`;
-    }).join('');
-
-    return {
-      bars,
-      guides,
-      legend,
-      axis: axisLine + xLabels + axisTitle + yAxisLabel,
-      yTicks,
-    };
-  }, [values, unit, H]);
-
-  const ref = useRef<HTMLDivElement | null>(null);
-
-  if (values.length === 0) {
+    return { sorted, min, max, range, innerW, innerH, nBins, counts };
+  }, [values, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+
+  if (!computed) {
     return (
       <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
     );
   }
+  const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed;
+  const maxCount = Math.max(...counts, 1);
+  const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+  const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+  const barW = innerW / nBins;
+
+  const fmt = fmtNum;
+
+  const quantile = (q: number): number => {
+    const pos = (sorted.length - 1) * q;
+    const lo = Math.floor(pos);
+    const hi = Math.ceil(pos);
+    return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+  };
+
+  const GUIDES = [
+    { label: 'p50', q: 0.5, color: '#3b82f6' },
+    { label: 'p75', q: 0.75, color: '#22c55e' },
+    { label: 'p90', q: 0.9, color: '#f59e0b' },
+    { label: 'p95', q: 0.95, color: '#ef4444' },
+  ] as const;
+
+  // Hover: report the bin range under the cursor, its count, and the cumulative
+  // share of samples up to and including that bin (i.e. at its right edge).
+  const resolve = (fraction: number) => {
+    const v = min + fraction * range;
+    const binIdx = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+    const binLo = min + (binIdx * range) / nBins;
+    const binHi = min + ((binIdx + 1) * range) / nBins;
+    const count = counts[binIdx] ?? 0;
+    // Cumulative % at the bin's right edge.
+    let cumCount = 0;
+    for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 0;
+    const cumPct = (cumCount / values.length) * 100;
+    const items: HoverItem[] = [
+      { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` },
+      { color: 'currentColor', label: 'Count', value: count.toLocaleString() },
+      { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` },
+    ];
+    return { items };
+  };
+
+  const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
 
   return (
-    <div ref={ref} className="w-full">
+    <div className="w-full">
       <div className="mb-2 text-xs text-muted-foreground">
-        {values.length.toLocaleString()} requests · range {Math.round(Math.min(...values))}–
-        {Math.round(Math.max(...values))} {unit}
+        {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit}
       </div>
-      <svg
-        viewBox={`0 0 ${W} ${H}`}
-        preserveAspectRatio="xMidYMid meet"
-        className="w-full h-auto text-foreground"
-        dangerouslySetInnerHTML={{
-          __html:
-            svgParts.bars + svgParts.guides + svgParts.axis + svgParts.yTicks + svgParts.legend,
-        }}
-      />
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis gridlines + labels */}
+        {yTickVals.map((v, i) => {
+          const y = yScale(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left - 4}
+                x2={PAD.left}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* Bars */}
+        {counts.map((c, i) => {
+          const h = (c / maxCount) * innerH;
+          const x = PAD.left + i * barW;
+          const y = PAD.top + (innerH - h);
+          return (
+            <rect
+              key={i}
+              x={x}
+              y={y}
+              width={Math.max(0, barW - 1)}
+              height={h}
+              fill="currentColor"
+              opacity={0.55}
+            />
+          );
+        })}
+
+        {/* Percentile guide lines */}
+        {GUIDES.map(({ q, color }) => {
+          const v = quantile(q);
+          const x = xScale(v);
+          return (
+            <line
+              key={q}
+              x1={x}
+              x2={x}
+              y1={PAD.top}
+              y2={PAD.top + innerH}
+              stroke={color}
+              strokeWidth={2}
+              strokeDasharray="5 3"
+              opacity={0.95}
+            />
+          );
+        })}
+
+        {/* X axis */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.2}
+        />
+        {xTickVals.map((v, i) => {
+          const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+          return (
+            <text
+              key={`x${i}`}
+              x={xScale(v)}
+              y={PAD.top + innerH + 14}
+              fontSize={11}
+              fill="currentColor"
+              opacity={0.7}
+              textAnchor={anchor}
+            >
+              {fmt(v)}
+            </text>
+          );
+        })}
+        <text
+          x={W / 2}
+          y={H - 22}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+        >
+          value ({unit})
+        </text>
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          count
+        </text>
+
+        {/* Percentile legend chips */}
+        {(() => {
+          const chipY = H - 8;
+          const chipW = innerW / GUIDES.length;
+          return GUIDES.map(({ label: ql, q, color }, i) => {
+            const v = quantile(q);
+            const x = PAD.left + i * chipW;
+            return (
+              <g key={ql}>
+                <line
+                  x1={x + 2}
+                  x2={x + 14}
+                  y1={chipY - 4}
+                  y2={chipY - 4}
+                  stroke={color}
+                  strokeWidth={2}
+                  strokeDasharray="5 3"
+                />
+                <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                  {ql} {fmt(v)}
+                </text>
+              </g>
+            );
+          });
+        })()}
+      </ChartHover>
     </div>
   );
 }
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
new file mode 100644
index 00000000..7c8e4538
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -0,0 +1,46 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+import { Maximize2 } from 'lucide-react';
+
+import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog';
+
+/**
+ * Card with a title and an expand button that opens the same chart in a large
+ * dialog. `render` is invoked as `render(false)` for the inline card and as
+ * `render(true)` inside the dialog, so the chart can pick a larger width/height.
+ */
+export function ExpandableChart({
+  title,
+  render,
+}: {
+  title: string;
+  render: (expanded: boolean) => ReactNode;
+}) {
+  const [open, setOpen] = useState(false);
+
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+      <div className="flex items-start justify-between mb-3 gap-2">
+        <h2 className="text-sm font-semibold text-foreground">{title}</h2>
+        <button
+          type="button"
+          aria-label="Expand chart"
+          onClick={() => setOpen(true)}
+          className="text-muted-foreground hover:text-foreground transition-colors"
+        >
+          <Maximize2 className="size-4" />
+        </button>
+      </div>
+      {render(false)}
+      <Dialog open={open} onOpenChange={setOpen}>
+        <DialogContent className="max-w-[min(96vw,1400px)] w-[min(96vw,1400px)]">
+          <DialogHeader>
+            <DialogTitle>{title}</DialogTitle>
+          </DialogHeader>
+          <div className="w-full">{render(true)}</div>
+        </DialogContent>
+      </Dialog>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index bc081b4e..cd10aff7 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -4,6 +4,8 @@ import { useMemo } from 'react';
 
 import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
 
+import { ChartHover, type HoverItem } from './chart-hover';
+
 interface Series {
   name: string;
   /** The line to draw (caller pre-smooths if desired). */
@@ -21,6 +23,7 @@ interface TimeSeriesChartProps {
   yMax?: number;
   yFmt?: (v: number) => string;
   yAxisLabel?: string;
+  width?: number;
   height?: number;
 }
 
@@ -43,10 +46,7 @@ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): Tim
   return out;
 }
 
-/**
- * Expanding-window cumulative mean from index 0..i. Useful for "running
- * average over the entire run" lines (red overlay in the throughput chart).
- */
+/** Expanding-window cumulative mean from index 0..i. */
 export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   if (data.length === 0) return data;
   const out: TimeSeriesPoint[] = Array.from({ length: data.length });
@@ -68,7 +68,7 @@ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSerie
   return out;
 }
 
-const fmtInt = (n: number) =>
+const fmtIntDefault = (n: number) =>
   n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
 
 const fmtSeconds = (s: number) => {
@@ -78,97 +78,72 @@ const fmtSeconds = (s: number) => {
   return `${m}m ${rem}s`;
 };
 
+/** Value of a time-sorted series at time `t`: clamped at both ends, linear in between. */
+function interpAt(data: TimeSeriesPoint[], t: number): number | null {
+  if (data.length === 0) return null;
+  const first = data[0]!;
+  const last = data.at(-1)!;
+  if (t <= first.t) return first.value;
+  if (t >= last.t) return last.value;
+  // Bisect down to the adjacent pair of samples that brackets `t`.
+  let [left, right] = [0, data.length - 1];
+  while (right - left > 1) {
+    const mid = Math.floor((left + right) / 2);
+    if (data[mid]!.t <= t) left = mid;
+    else right = mid;
+  }
+  const from = data[left]!;
+  const to = data[right]!;
+  if (to.t === from.t) return from.value;
+  return from.value + (to.value - from.value) * ((t - from.t) / (to.t - from.t));
+}
+
 export function TimeSeriesChart({
   series,
   durationS,
   yMax: yMaxOpt,
-  yFmt = fmtInt,
+  yFmt = fmtIntDefault,
   yAxisLabel,
+  width = 720,
   height = 260,
 }: TimeSeriesChartProps) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const inner = useMemo(() => {
+  const layout = useMemo(() => {
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
     const xMax = Math.max(durationS, 1);
     const yMax = yMaxOpt ?? Math.max(1e-9, ...series.flatMap((s) => s.data.map((d) => d.value)));
     const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
     const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
-
-    const subsample = (arr: TimeSeriesPoint[]) => {
-      if (arr.length === 0) return arr;
-      const stride = Math.max(1, Math.floor(arr.length / innerW));
-      return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
-    };
-
-    // Layered render: raw scatter (back) → lines (front). Iterate twice so
-    // emphasis lines (high strokeWidth) draw over everything else.
-    const dotsLayer = series
-      .filter((s) => s.rawData && s.rawData.length > 0)
-      .map((s) =>
-        subsample(s.rawData!)
-          .map((d) => {
-            const x = xScale(d.t);
-            const y = yScale(d.value);
-            return `<circle cx="${x.toFixed(2)}" cy="${y.toFixed(2)}" r="1.5" fill="${s.color}" opacity="0.2" />`;
-          })
-          .join(''),
-      )
-      .join('');
-
-    const lineLayer = series
-      .map((s) => {
-        if (s.data.length === 0) return '';
-        const sampled = subsample(s.data);
-        const pts = sampled.map((d) => [xScale(d.t), yScale(d.value)] as [number, number]);
-        const path = pts
-          .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
-          .join(' ');
-        return `<path d="${path}" fill="none" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 1.8}" />`;
-      })
-      .join('');
-
-    const paths = dotsLayer + lineLayer;
-
-    // X-axis: 5 ticks at 0..xMax
-    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
-    const axisY = PAD.top + innerH;
-    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
-      .map((v, i) => {
-        const x = xScale(v);
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
-      })
-      .join('')}`;
-    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
-
-    // Y-axis: 5 ticks at 0..yMax
-    const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${yFmt(v)}</text></g>`;
-      })
-      .join('');
-    const yAxisTitle = yAxisLabel
-      ? `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">${yAxisLabel}</text>`
-      : '';
-
-    // Legend at the bottom of the SVG
-    const chipY = H - 8;
-    const chipW = innerW / Math.max(1, series.length);
-    const legend = series
-      .map((s, i) => {
-        const x = PAD.left + i * chipW;
-        return `<line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 2}" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${s.name}</text>`;
-      })
-      .join('');
-
-    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
-  }, [series, durationS, yMaxOpt, yFmt, yAxisLabel, H]);
+    return { innerW, innerH, xMax, yMax, xScale, yScale };
+  }, [series, durationS, yMaxOpt, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+
+  const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
+
+  const subsample = (arr: TimeSeriesPoint[]) => {
+    if (arr.length === 0) return arr;
+    const stride = Math.max(1, Math.floor(arr.length / innerW));
+    return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+  };
+
+  // Axis tick values: five evenly spaced ticks from 0 to each axis maximum.
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    const items: HoverItem[] = [];
+    for (const s of series) {
+      const v = interpAt(s.data, t);
+      if (v === null || !Number.isFinite(v)) continue;
+      items.push({ color: s.color, label: s.name, value: yFmt(v) });
+    }
+    if (items.length === 0) return null;
+    return { items, title: fmtSeconds(t) };
+  };
 
   if (series.every((s) => s.data.length === 0)) {
     return (
@@ -177,12 +152,146 @@ export function TimeSeriesChart({
   }
 
   return (
-    <svg
-      viewBox={`0 0 ${W} ${H}`}
-      preserveAspectRatio="xMidYMid meet"
-      className="w-full h-auto text-foreground"
-      dangerouslySetInnerHTML={{ __html: inner }}
-    />
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {/* y-axis gridlines + labels */}
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {yFmt(v)}
+            </text>
+          </g>
+        );
+      })}
+
+      {/* Raw scatter underlay */}
+      {series
+        .filter((s) => s.rawData && s.rawData.length > 0)
+        .map((s, si) =>
+          subsample(s.rawData!).map((d, i) => (
+            <circle
+              key={`r${si}-${i}`}
+              cx={xScale(d.t)}
+              cy={yScale(d.value)}
+              r={1.5}
+              fill={s.color}
+              opacity={0.2}
+            />
+          )),
+        )}
+
+      {/* Lines */}
+      {series.map((s, si) => {
+        if (s.data.length === 0) return null;
+        const sampled = subsample(s.data);
+        const path = sampled
+          .map(
+            (d, i) =>
+              `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`,
+          )
+          .join(' ');
+        return (
+          <path
+            key={`l${si}`}
+            d={path}
+            fill="none"
+            stroke={s.color}
+            strokeWidth={s.strokeWidth ?? 1.8}
+          />
+        );
+      })}
+
+      {/* X-axis */}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+
+      {yAxisLabel && (
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          {yAxisLabel}
+        </text>
+      )}
+
+      {/* Legend */}
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, series.length);
+        return series.map((s, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <line
+                x1={x + 2}
+                x2={x + 14}
+                y1={chipY - 4}
+                y2={chipY - 4}
+                stroke={s.color}
+                strokeWidth={s.strokeWidth ?? 2}
+              />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {s.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
   );
 }
 
@@ -190,19 +299,21 @@ export function TimeSeriesChart({
 export function StackedAreaChart({
   sourceSeries,
   durationS,
+  width = 720,
   height = 260,
 }: {
   sourceSeries: Record<string, TimeSeriesPoint[]>;
   durationS: number;
+  width?: number;
   height?: number;
 }) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const inner = useMemo(() => {
+  const computed = useMemo(() => {
     const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
-    if (entries.length === 0) return '';
+    if (entries.length === 0) return null;
     const tValues = entries[0]![1].map((p) => p.t);
     const cum: Record<string, number[]> = {};
     for (const [name, arr] of entries) {
@@ -220,92 +331,166 @@ export function StackedAreaChart({
         shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0);
       }
     }
-
-    const colors: Record<string, string> = {
-      local_compute: '#f97316',
-      local_cache_hit: '#3b82f6',
-      external_kv_transfer: '#22c55e',
-      miss: '#f97316',
-    };
-    const labelFor: Record<string, string> = {
-      local_compute: 'Prefill',
-      local_cache_hit: 'HBM Cache Hit',
-      external_kv_transfer: 'Offload Cache Hit',
-      miss: 'Miss',
-    };
-
-    const innerW = W - PAD.left - PAD.right;
-    const innerH = H - PAD.top - PAD.bottom;
-    const xMax = Math.max(durationS, 1);
-    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
-    const yScale = (v: number) => PAD.top + (1 - v) * innerH;
-
-    const stackOrder = Object.keys(shares);
-    const lower: number[] = Array.from({ length: tValues.length }, () => 0);
-    const layers = stackOrder.map((name) => {
-      const upper = shares[name]!.map((v, i) => lower[i]! + v);
-      const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
-      const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
-      const d = `${top
-        .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
-        .join(' ')} ${[...bottom]
-        .toReversed()
-        .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
-        .join(' ')} Z`;
-      const color = colors[name] ?? '#6b7280';
-      const path = `<path d="${d}" fill="${color}" opacity="0.75" />`;
-      for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
-      return { name, color, path };
-    });
-
-    const paths = layers.map((l) => l.path).join('');
-
-    // X-axis
-    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
-    const axisY = PAD.top + innerH;
-    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
-      .map((v, i) => {
-        const x = xScale(v);
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
-      })
-      .join('')}`;
-    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
-
-    // Y-axis 0..100%
-    const yTickVals = [0, 0.25, 0.5, 0.75, 1];
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${(v * 100).toFixed(0)}%</text></g>`;
-      })
-      .join('');
-    const yAxisTitle = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">% of prefill tokens</text>`;
-
-    const chipY = H - 8;
-    const chipW = innerW / Math.max(1, layers.length);
-    const legend = layers
-      .map((l, i) => {
-        const x = PAD.left + i * chipW;
-        return `<rect x="${(x + 2).toFixed(2)}" y="${chipY - 9}" width="12" height="8" fill="${l.color}" opacity="0.75" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${labelFor[l.name] ?? l.name}</text>`;
-      })
-      .join('');
-
-    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
-  }, [sourceSeries, durationS, H]);
-
-  if (Object.values(sourceSeries).every((v) => v.length === 0)) {
+    return { tValues, shares };
+  }, [sourceSeries]);
+
+  const colors: Record<string, string> = {
+    local_compute: '#f97316',
+    local_cache_hit: '#3b82f6',
+    external_kv_transfer: '#22c55e',
+    miss: '#f97316',
+  };
+  const labelFor: Record<string, string> = {
+    local_compute: 'Prefill',
+    local_cache_hit: 'HBM Cache Hit',
+    external_kv_transfer: 'Offload Cache Hit',
+    miss: 'Miss',
+  };
+
+  if (!computed) {
     return (
       <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
     );
   }
+  const { tValues, shares } = computed;
+
+  const innerW = W - PAD.left - PAD.right;
+  const innerH = H - PAD.top - PAD.bottom;
+  const xMax = Math.max(durationS, 1);
+  const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+  const yScale = (v: number) => PAD.top + (1 - v) * innerH;
+
+  const stackOrder = Object.keys(shares);
+  const lower: number[] = Array.from({ length: tValues.length }, () => 0);
+  const layers = stackOrder.map((name) => {
+    const upper = shares[name]!.map((v, i) => lower[i]! + v);
+    const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const d = `${top
+      .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} ${[...bottom]
+      .toReversed()
+      .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} Z`;
+    const color = colors[name] ?? '#6b7280';
+    for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
+    return { name, color, d };
+  });
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    // Find the closest tValue index.
+    let idx = 0;
+    let bestDist = Infinity;
+    for (let i = 0; i < tValues.length; i++) {
+      const d = Math.abs(tValues[i]! - t);
+      if (d < bestDist) {
+        bestDist = d;
+        idx = i;
+      }
+    }
+    const items: HoverItem[] = stackOrder.map((name) => ({
+      color: colors[name] ?? '#6b7280',
+      label: labelFor[name] ?? name,
+      value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
+    }));
+    return { items, title: fmtSeconds(t) };
+  };
+
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = [0, 0.25, 0.5, 0.75, 1];
 
   return (
-    <svg
-      viewBox={`0 0 ${W} ${H}`}
-      preserveAspectRatio="xMidYMid meet"
-      className="w-full h-auto text-foreground"
-      dangerouslySetInnerHTML={{ __html: inner }}
-    />
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {(v * 100).toFixed(0)}%
+            </text>
+          </g>
+        );
+      })}
+      {layers.map((l, i) => (
+        <path key={i} d={l.d} fill={l.color} opacity={0.75} />
+      ))}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+      <text
+        x={10}
+        y={H / 2}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+        transform={`rotate(-90 10 ${H / 2})`}
+      >
+        % of prefill tokens
+      </text>
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, layers.length);
+        return layers.map((l, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <rect x={x + 2} y={chipY - 9} width={12} height={8} fill={l.color} opacity={0.75} />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {labelFor[l.name] ?? l.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
   );
 }

From 1d502ac198495147ef579140121a3e49a9f4349f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 20:09:55 -0500
Subject: [PATCH 25/55] feat(inference): one chart with TTFT / E2E /
 Interactivity x-axis picker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the always-rendered pair of charts (interactivity + e2e) with a
single chart whose x-axis is chosen by big pill-shaped buttons above the
card. Three options: TTFT (e2e chart with x = p90_ttft), E2E Latency
(e2e chart with x = median_e2el / p90_e2el), Interactivity (interactivity
chart). The inline E2E dropdown is removed — the buttons replace it.

Mode is persisted to ?i_xmode= and defaults by scenario kind:
  agentic   → TTFT
  fixed-seq → Interactivity

The initial state is SSR-stable: it is seeded only from the URL param
(falling back to 'ttft' when absent), never from the scenario kind. A
post-mount effect then snaps to the kind default if no URL value was
provided, and the same effect re-snaps on subsequent sequence-kind
switches. The mode setter also keeps selectedE2eXAxisMetric aligned so
the existing useChartData pipeline resolves the right x-axis for the
e2e chart variant.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts     |   2 +
 .../components/inference/InferenceContext.tsx |  45 ++++-
 .../app/src/components/inference/types.ts     |   9 +
 .../components/inference/ui/ChartDisplay.tsx  | 162 ++++++++----------
 packages/app/src/lib/url-state.ts             |   2 +
 5 files changed, 130 insertions(+), 90 deletions(-)

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index 34b89aba..2d3c982f 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -195,6 +195,8 @@ export function createMockInferenceContext(
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
+    selectedXAxisMode: 'interactivity',
+    setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
     isLegendExpanded: true,
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index c80afc2e..00ea316c 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -43,7 +43,7 @@ import {
 import { useUrlState } from '@/hooks/useUrlState';
 import { buildAvailabilityHwKey } from '@/lib/chart-utils';
 import { getHardwareConfig, getModelSortIndex, isKnownGpu, TABLEAU_10 } from '@/lib/constants';
-import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING } from '@/lib/data-mappings';
+import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING, sequenceKind } from '@/lib/data-mappings';
 import {
   MtpEngineConflictToast,
   type MtpEngineConflictDetail,
@@ -133,6 +133,26 @@ export function InferenceProvider({
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
   );
+  // Selected chart variant. Initialize from URL only — SSR cannot read URL, so
+  // computing a kind-based default here would diverge between server and client
+  // and cause a hydration mismatch. The scenario-kind default is applied in a
+  // post-mount effect below (and a ref tracks whether the user has overridden).
+  const urlXMode = (() => {
+    const v = getUrlParam('i_xmode');
+    return v === 'ttft' || v === 'e2e' || v === 'interactivity' ? v : null;
+  })();
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<'ttft' | 'e2e' | 'interactivity'>(
+    urlXMode ?? 'ttft',
+  );
+  const xAxisModeFromUrlRef = useRef(urlXMode !== null);
+  // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
+  // existing useChartData pipeline keys off that flag for the e2e chart's x-axis.
+  const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
+    xAxisModeFromUrlRef.current = true;
+    setSelectedXAxisMode(mode);
+    if (mode === 'ttft') setSelectedE2eXAxisMetric('p90_ttft');
+    else if (mode === 'e2e') setSelectedE2eXAxisMetric(null);
+  }, []);
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
@@ -325,6 +345,24 @@ export function InferenceProvider({
     setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev));
   }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]);
 
+  // Reconcile the x-axis mode with the scenario kind:
+  //  - On mount with no `i_xmode` URL param: snap to the kind's natural default
+  //    (agentic → ttft, fixed → interactivity). The state itself was initialized
+  //    to a SSR-stable constant so server and client render the same DOM; this
+  //    effect fixes it up after hydration.
+  //  - When the user later switches sequence kinds: snap to the new kind's
+  //    natural default (the prior selection was for a different kind, so it
+  //    doesn't carry over).
+  const lastSeqKindRef = useRef<ReturnType<typeof sequenceKind> | null>(null);
+  useEffect(() => {
+    const kind = sequenceKind(effectiveSequence);
+    const isInitialMount = lastSeqKindRef.current === null;
+    if (!isInitialMount && lastSeqKindRef.current === kind) return;
+    lastSeqKindRef.current = kind;
+    if (isInitialMount && xAxisModeFromUrlRef.current) return;
+    handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
+  }, [effectiveSequence, handleSetXAxisMode]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
@@ -785,6 +823,7 @@ export function InferenceProvider({
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
+      i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
       i_advlabel: useAdvancedLabels ? '1' : '',
@@ -798,6 +837,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
@@ -968,6 +1008,8 @@ export function InferenceProvider({
       setSelectedXAxisMetric,
       selectedE2eXAxisMetric,
       setSelectedE2eXAxisMetric,
+      selectedXAxisMode,
+      setSelectedXAxisMode: handleSetXAxisMode,
       scaleType,
       setScaleType,
       loading,
@@ -1041,6 +1083,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 7a39bbd1..3bbee596 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -532,6 +532,15 @@ export interface InferenceChartContextType {
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
+  /**
+   * Which chart variant the user wants to see — the inference card shows one chart
+   * at a time, picked by the big TTFT / E2E Latency / Interactivity buttons.
+   * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
+   * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
+   * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   */
+  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity';
+  setSelectedXAxisMode: (mode: 'ttft' | 'e2e' | 'interactivity') => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   setIsLegendExpanded: (metric: boolean) => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index e9021aed..f0611274 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -2,7 +2,7 @@
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
 import { useMemo, useRef, useState } from 'react';
-import { BarChart3, ChevronDown, Table2, X } from 'lucide-react';
+import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import { useInference } from '@/components/inference/InferenceContext';
@@ -30,7 +30,6 @@ import {
   DialogHeader,
   DialogTitle,
 } from '@/components/ui/dialog';
-import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
 import { Skeleton } from '@/components/ui/skeleton';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import {
@@ -60,54 +59,25 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra
 });
 import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 
-/** Controlled popover dropdown for the e2e chart x-axis toggle. */
-function E2eXAxisDropdown({
-  xAxisLabel,
-  xAxisOptions,
-  selectedValue,
-  onSelect,
-}: {
-  xAxisLabel: string;
-  xAxisOptions: { value: string | null; label: string }[];
-  selectedValue: string | null;
-  onSelect: (value: string | null) => void;
-}) {
-  const [open, setOpen] = useState(false);
-  return (
-    <Popover open={open} onOpenChange={setOpen}>
-      <PopoverTrigger asChild>
-        <button
-          className="inline-flex items-center gap-1 hover:opacity-70 transition-opacity cursor-pointer"
-          onClick={(e) => e.stopPropagation()}
-        >
-          vs. {xAxisLabel}
-          <ChevronDown className="no-export size-3.5 shrink-0 opacity-60" />
-        </button>
-      </PopoverTrigger>
-      <PopoverContent className="w-48 p-1" align="start">
-        {xAxisOptions.map((opt) => (
-          <button
-            key={opt.label}
-            className={`w-full text-left px-3 py-1.5 text-sm rounded hover:bg-accent transition-colors ${
-              (opt.value === null && !selectedValue) || opt.value === selectedValue
-                ? 'font-medium'
-                : ''
-            }`}
-            onClick={() => {
-              onSelect(opt.value);
-              setOpen(false);
-            }}
-          >
-            {opt.label}
-          </button>
-        ))}
-      </PopoverContent>
-    </Popover>
-  );
-}
-
 type InferenceViewMode = 'chart' | 'table';
 
+/**
+ * The three chart variants the user can choose with the big buttons above the
+ * chart card. Each maps to one entry in `inference-chart-config.json` plus a
+ * forced x-axis override for the E2E chartType.
+ */
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity';
+
+interface XAxisModeButton {
+  value: XAxisMode;
+  label: string;
+}
+const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
+  { value: 'ttft', label: 'TTFT' },
+  { value: 'e2e', label: 'E2E Latency' },
+  { value: 'interactivity', label: 'Interactivity' },
+];
+
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
   {
     value: 'chart',
@@ -152,9 +122,10 @@ export default function ChartDisplay() {
     logScale,
     activeHwTypes,
     activeDates,
-    setSelectedE2eXAxisMetric,
     selectedPercentile,
     compareGpuPair,
+    selectedXAxisMode,
+    setSelectedXAxisMode,
   } = useInference();
 
   const {
@@ -329,17 +300,26 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
+  // Show one chart at a time, picked by the TTFT / E2E / Interactivity buttons.
+  // Both 'ttft' and 'e2e' modes render the e2e chart (the x-axis swap is handled
+  // upstream by `selectedE2eXAxisMetric`, which `setSelectedXAxisMode` keeps in sync).
+  const visibleGraphs = useMemo(() => {
+    const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
+    const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
+    return filtered.length > 0 ? filtered : effectiveGraphs;
+  }, [effectiveGraphs, selectedXAxisMode]);
+
   const displayGraphs = isFirstLoad
-    ? Array.from({ length: 2 }).map((_, index) => (
-        <Card key={`skeleton-${index}`}>
+    ? [
+        <Card key="skeleton-0">
           <Skeleton className="h-7 w-2/4 mb-1" />
           <Skeleton className="h-5 w-3/4 mb-2" />
           <Skeleton className="h-[600px] w-full" />
-        </Card>
-      ))
-    : effectiveGraphs.length === 0
+        </Card>,
+      ]
+    : visibleGraphs.length === 0
       ? []
-      : effectiveGraphs.map((graph, graphIndex) => {
+      : visibleGraphs.map((graph, graphIndex) => {
           const isTimelineMode = Boolean(
             selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
           );
@@ -415,43 +395,17 @@ export default function ChartDisplay() {
                               return 'vs. P90 Time To First Token';
                             }
 
-                            // For e2e chart: render clickable inline dropdown for x-axis
+                            // For e2e chart: heading is driven by the TTFT / E2E button
+                            // selection above the card, so the inline dropdown is gone.
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               const pctlWord = selectedPercentile.toUpperCase();
-                              const e2elLabel = isAgentic
-                                ? `${pctlWord} End-to-end Latency`
-                                : 'End-to-end Latency';
-                              const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p90_ttft' ? 'P90 TTFT' : e2elLabel;
-                              const xAxisOptions = [
-                                { value: null, label: e2elLabel },
-                                { value: 'p90_ttft', label: 'P90 TTFT' },
-                              ];
-                              const zoomPrefix =
-                                selectedDateRange.startDate &&
-                                selectedDateRange.endDate &&
-                                selectedGPUs.length > 0
-                                  ? 'gpu_timeseries'
-                                  : 'latency';
-                              return (
-                                <E2eXAxisDropdown
-                                  xAxisLabel={xAxisLabel}
-                                  xAxisOptions={xAxisOptions}
-                                  selectedValue={selectedE2eXAxisMetric}
-                                  onSelect={(value) => {
-                                    setSelectedE2eXAxisMetric(value);
-                                    track('latency_x_axis_metric_selected', {
-                                      metric: value ?? 'median_e2el',
-                                    });
-                                    window.dispatchEvent(
-                                      new CustomEvent(
-                                        `${zoomPrefix}_zoom_reset_chart-${graphIndex}`,
-                                      ),
-                                    );
-                                  }}
-                                />
-                              );
+                              if (selectedE2eXAxisMetric === 'p90_ttft') {
+                                return 'vs. P90 Time To First Token';
+                              }
+                              return isAgentic
+                                ? `vs. ${pctlWord} End-to-end Latency`
+                                : 'vs. End-to-end Latency';
                             }
 
                             // Fall back to the heading baked into chartDefinition
@@ -636,6 +590,36 @@ export default function ChartDisplay() {
           <CustomPowers loading={loading} />
         </section>
       )}
+      <section
+        className="flex flex-wrap justify-center gap-3 sm:gap-4"
+        role="tablist"
+        aria-label="Chart x-axis metric"
+        data-testid="x-axis-mode-buttons"
+      >
+        {X_AXIS_MODE_BUTTONS.map(({ value, label }) => {
+          const isActive = selectedXAxisMode === value;
+          return (
+            <button
+              key={value}
+              type="button"
+              role="tab"
+              aria-selected={isActive}
+              data-testid={`x-axis-mode-${value}`}
+              onClick={() => {
+                setSelectedXAxisMode(value);
+                track('latency_x_axis_mode_selected', { mode: value });
+              }}
+              className={`min-w-[160px] flex-1 sm:flex-initial rounded-full border-2 px-6 py-3 text-base font-semibold transition-colors ${
+                isActive
+                  ? 'border-primary bg-primary text-primary-foreground shadow-sm'
+                  : 'border-border bg-card text-foreground hover:border-primary/60 hover:bg-accent'
+              }`}
+            >
+              {label}
+            </button>
+          );
+        })}
+      </section>
       <div className="flex flex-col gap-4">{displayGraphs}</div>
 
       {/* Performance Over Time — Modal Drill-Down */}
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 4a48a776..73cbe0b7 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -25,6 +25,7 @@ const URL_STATE_KEYS = [
   'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
+  'i_xmode',
   'i_scale',
   'i_gpus',
   'i_dates',
@@ -70,6 +71,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_pctl: 'p90',
   i_xmetric: 'p90_ttft',
   i_e2e_xmetric: 'p90_ttft',
+  i_xmode: '',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 965c8622a36f02a6762388728c855da3ff2aa530 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 20:15:42 -0500
Subject: [PATCH 26/55] fix(inference): TTFT/E2E pick metric by sequence kind +
 add P75 option

Two related fixes for the x-axis-mode picker:

1. Fixed-seq has no p90_ttft / p90_e2el in the metrics JSONB (only
   median/p99). The TTFT button was hardcoded to p90_ttft, so the chart
   went blank on fixed-seq scenarios. Reconcile selectedE2eXAxisMetric in
   a reactive effect that picks median_ttft for fixed-seq and the user's
   selected percentile for agentic. useChartData's TTFT override now
   matches any *_ttft metric and derives its label from the actual
   percentile, instead of hardcoding "P90".

2. Add P75 to the agentic latency percentile selector. Update
   withPercentile + the label/heading regexes to handle p75 and p95.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx | 23 +++++++++++++++++--
 .../inference/hooks/useChartData.ts           | 16 +++++++++----
 .../components/inference/ui/ChartDisplay.tsx  | 10 +++++---
 packages/app/src/lib/benchmark-transform.ts   |  2 +-
 packages/app/src/lib/data-mappings.ts         |  8 ++++---
 5 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 00ea316c..74bdb28b 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -150,8 +150,9 @@ export function InferenceProvider({
   const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
     xAxisModeFromUrlRef.current = true;
     setSelectedXAxisMode(mode);
-    if (mode === 'ttft') setSelectedE2eXAxisMetric('p90_ttft');
-    else if (mode === 'e2e') setSelectedE2eXAxisMetric(null);
+    // The e2e chart's x-axis metric is reconciled in a separate effect below,
+    // because it depends on sequence kind (fixed-seq has no p90_* metrics) and
+    // the agentic percentile, both of which can change independently.
   }, []);
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
@@ -363,6 +364,24 @@ export function InferenceProvider({
     handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
   }, [effectiveSequence, handleSetXAxisMode]);
 
+  // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
+  // agentic percentile changes. For fixed-seq the JSONB only carries
+  // median_* / p99_* (no p90_*), so the TTFT button there has to point at
+  // median_ttft — otherwise the chart goes blank. For agentic, we point at
+  // the user's chosen percentile so the dropdown actually drives the axis.
+  useEffect(() => {
+    const isAgentic = sequenceKind(effectiveSequence) === 'agentic';
+    if (selectedXAxisMode === 'ttft') {
+      setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft');
+    } else if (selectedXAxisMode === 'e2e') {
+      // null = use the chart-config natural x (median_e2el), which useChartData
+      // rewrites to <pctl>_e2el for agentic via withPercentile().
+      setSelectedE2eXAxisMetric(null);
+    }
+    // 'interactivity' mode renders the interactivity chart, which keys off
+    // selectedXAxisMetric (not the e2e one), so nothing to do here.
+  }, [selectedXAxisMode, effectiveSequence, selectedPercentile]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 0d13b8ca..ffa6a8a7 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -215,8 +215,16 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
-        const isTtftOverride = effectiveXMetric === 'p90_ttft';
-        const ttftLabel = 'P90 Time To First Token (s)';
+        // The TTFT override is now any *_ttft metric (not just p90_ttft) — the
+        // x-axis-mode picker reconciles the percentile prefix based on sequence
+        // kind (fixed-seq → median, agentic → user-picked percentile).
+        const isTtftOverride =
+          typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft');
+        const ttftPctl = isTtftOverride
+          ? (effectiveXMetric as string).replace(/_ttft$/u, '')
+          : 'p90';
+        const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase();
+        const ttftLabel = `${ttftPctlWord} Time To First Token (s)`;
 
         const isAgentic = selectedSequence === Sequence.AgenticTraces;
 
@@ -261,9 +269,9 @@ export function useChartData(
             selectedPercentile,
           ) as keyof AggDataEntry;
           const pctlWord = selectedPercentile.toUpperCase();
-          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord);
           chartHeading = chartHeading.replace(
-            /^(vs\.\s+)(?:(Median|Mean|P90|P99(?:\.9)?)\s+)?/iu,
+            /^(vs\.\s+)(?:(Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu,
             `$1${pctlWord} `,
           );
         }
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index f0611274..ca7f9cd7 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -397,12 +397,16 @@ export default function ChartDisplay() {
 
                             // For e2e chart: heading is driven by the TTFT / E2E button
                             // selection above the card, so the inline dropdown is gone.
+                            // The metric carries the percentile prefix (e.g. p90_ttft,
+                            // median_ttft for fixed-seq, p75_ttft for agentic+p75).
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              const pctlWord = selectedPercentile.toUpperCase();
-                              if (selectedE2eXAxisMetric === 'p90_ttft') {
-                                return 'vs. P90 Time To First Token';
+                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
+                                return `vs. ${word} Time To First Token`;
                               }
+                              const pctlWord = selectedPercentile.toUpperCase();
                               return isAgentic
                                 ? `vs. ${pctlWord} End-to-end Latency`
                                 : 'vs. End-to-end Latency';
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index c5bdd6ed..ba26a978 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -136,7 +136,7 @@ interface PreparedEntry {
  * percentile prefix; leaves everything else alone.
  */
 export function withPercentile(key: string, percentile: string): string {
-  return key.replace(/^(mean|median|p90|p99|p99\.9)_/, `${percentile}_`);
+  return key.replace(/^(mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
 }
 
 /**
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 91f65a34..c18266ba 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -185,15 +185,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
 /**
  * Percentile of the latency distribution used for the chart x-axis when
- * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
- * is surfaced in the UI.
+ * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9
+ * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl);
+ * p75 and p90 are surfaced in the UI.
  */
 export enum Percentile {
+  P75 = 'p75',
   P90 = 'p90',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.P75]: { label: 'p75' },
   [Percentile.P90]: { label: 'p90' },
 };
 

From e4d97f29bb3ff3a973a7b84113dc61278f70abf8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:18:17 -0500
Subject: [PATCH 27/55] feat(metrics): wire P75/P95 through frontend + register
 new aiperf keys

The P75 percentile option I just added was broken: rowToAggDataEntry
only copied median/mean/p90/p99/p99.9 from the metrics JSONB, so the
chart looked up entry.p75_ttft which didn't exist and points fell to 0.

- Add p75_*/p95_* fields for ttft/tpot/itl/e2el/intvty to AggDataEntry
  and rowToAggDataEntry so the existing percentile pipeline can resolve them.
- Update the energy-metrics test fixture for the new required fields.
- Register all new aiperf metric keys (p75/p95 latencies, qps stats,
  per-request token-count distribution, run totals, server cache hit rates,
  total/input/output tput_tps) in METRIC_KEYS so the ingest auto-capture
  warning stops firing on the next agentic run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/inference/types.ts     | 10 +++
 packages/app/src/lib/benchmark-transform.ts   | 10 +++
 packages/app/src/lib/energy-metrics.test.ts   | 10 +++
 packages/constants/src/metric-keys.ts         | 66 ++++++++++++++++++-
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 3bbee596..0a9908e3 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -52,7 +52,9 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p75_ttft: number;
   p90_ttft: number;
+  p95_ttft: number;
   p99_ttft: number;
   'p99.9_ttft': number;
   mean_tpot: number;
@@ -61,8 +63,12 @@ export interface AggDataEntry {
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p75_tpot: number;
+  p75_intvty: number;
   p90_tpot: number;
   p90_intvty: number;
+  p95_tpot: number;
+  p95_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
   'p99.9_tpot': number;
@@ -70,13 +76,17 @@ export interface AggDataEntry {
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p75_itl: number;
   p90_itl: number;
+  p95_itl: number;
   p99_itl: number;
   'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p75_e2el: number;
   p90_e2el: number;
+  p95_e2el: number;
   p99_e2el: number;
   'p99.9_e2el': number;
   disagg: boolean;
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index ba26a978..3594750c 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -64,31 +64,41 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p75_ttft: m.p75_ttft ?? 0,
     p90_ttft: m.p90_ttft ?? 0,
+    p95_ttft: m.p95_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
     'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p75_tpot: m.p75_tpot ?? 0,
     p90_tpot: m.p90_tpot ?? 0,
+    p95_tpot: m.p95_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
     'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p75_intvty: m.p75_intvty ?? 0,
     p90_intvty: m.p90_intvty ?? 0,
+    p95_intvty: m.p95_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
     'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p75_itl: m.p75_itl ?? 0,
     p90_itl: m.p90_itl ?? 0,
+    p95_itl: m.p95_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
     'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p75_e2el: m.p75_e2el ?? 0,
     p90_e2el: m.p90_e2el ?? 0,
+    p95_e2el: m.p95_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
     'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     disagg: row.disagg,
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 54788585..2f5844c1 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,7 +57,9 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p75_ttft: 0.65,
     p90_ttft: 0.7,
+    p95_ttft: 0.75,
     p99_ttft: 0.8,
     'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
@@ -66,8 +68,12 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p75_tpot: 0.022,
+    p75_intvty: 50,
     p90_tpot: 0.025,
     p90_intvty: 55,
+    p95_tpot: 0.028,
+    p95_intvty: 58,
     p99_tpot: 0.03,
     p99_intvty: 60,
     'p99.9_tpot': 0.035,
@@ -75,13 +81,17 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p75_itl: 0.012,
     p90_itl: 0.013,
+    p95_itl: 0.014,
     p99_itl: 0.015,
     'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p75_e2el: 5.2,
     p90_e2el: 5.5,
+    p95_e2el: 5.8,
     p99_e2el: 6,
     'p99.9_e2el': 6.5,
     disagg: false,
diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index cf2c4d0b..70e50f96 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -1,46 +1,110 @@
 /**
  * Canonical set of metric keys stored in the benchmark_results.metrics JSONB column.
  *
- * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU.
+ * Latency values (ttft/tpot/itl/e2el) are in seconds; intvty looks like a rate, not seconds
+ * (TODO confirm). Throughput is tokens/sec — `_per_gpu` is per-GPU, `_tps` is the deployment total.
+ *
+ * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency,
+ * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs
+ * carry median/mean/p99/std for latency only.
  */
 export const METRIC_KEYS = new Set([
   // throughput (tokens/sec/GPU)
   'tput_per_gpu',
   'output_tput_per_gpu',
   'input_tput_per_gpu',
+  // throughput (tokens/sec, deployment total) — agentic aiperf reports both
+  'total_tput_tps',
+  'output_tput_tps',
+  'input_tput_tps',
   // TTFT — time to first token
   'median_ttft',
   'mean_ttft',
+  'p75_ttft',
   'p90_ttft',
+  'p95_ttft',
   'p99_ttft',
   'p99.9_ttft',
   'std_ttft',
   // TPOT — time per output token
   'median_tpot',
   'mean_tpot',
+  'p75_tpot',
   'p90_tpot',
+  'p95_tpot',
   'p99_tpot',
   'p99.9_tpot',
   'std_tpot',
   // ITL — inter-token latency
   'median_itl',
   'mean_itl',
+  'p75_itl',
   'p90_itl',
+  'p95_itl',
   'p99_itl',
   'p99.9_itl',
   'std_itl',
   // E2EL — end-to-end latency
   'median_e2el',
   'mean_e2el',
+  'p75_e2el',
   'p90_e2el',
+  'p95_e2el',
   'p99_e2el',
   'p99.9_e2el',
   'std_e2el',
   // interactivity
   'median_intvty',
   'mean_intvty',
+  'p75_intvty',
   'p90_intvty',
+  'p95_intvty',
   'p99_intvty',
   'p99.9_intvty',
   'std_intvty',
+  // QPS — queries per second (agentic aiperf)
+  'median_qps',
+  'mean_qps',
+  'p75_qps',
+  'p90_qps',
+  'p95_qps',
+  'p99_qps',
+  'p99.9_qps',
+  'std_qps',
+  // per-request input token count distribution
+  'median_input_tokens',
+  'mean_input_tokens',
+  'p75_input_tokens',
+  'p90_input_tokens',
+  'p95_input_tokens',
+  'p99_input_tokens',
+  'p99.9_input_tokens',
+  'std_input_tokens',
+  // per-request output token count distribution — actual served
+  'median_output_tokens_actual',
+  'mean_output_tokens_actual',
+  'p75_output_tokens_actual',
+  'p90_output_tokens_actual',
+  'p95_output_tokens_actual',
+  'p99_output_tokens_actual',
+  'p99.9_output_tokens_actual',
+  'std_output_tokens_actual',
+  // per-request output token count distribution — expected from trace
+  'median_output_tokens_expected',
+  'mean_output_tokens_expected',
+  'p75_output_tokens_expected',
+  'p90_output_tokens_expected',
+  'p95_output_tokens_expected',
+  'p99_output_tokens_expected',
+  'p99.9_output_tokens_expected',
+  'std_output_tokens_expected',
+  // run totals (agentic aiperf)
+  'duration_seconds',
+  'total_requests_completed',
+  'total_prompt_tokens',
+  'total_generation_tokens',
+  // server prefix-cache observability (agentic aiperf)
+  'server_gpu_cache_hit_rate',
+  'server_cpu_cache_hit_rate',
+  'theoretical_cache_hit_rate',
 ]);

From a7a135401f18ad2c24f6c87b25a1a255826309db Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:20:54 -0500
Subject: [PATCH 28/55] fix(inference): don't drop agentic TTFT points over 60s
 as outliers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

useChartData hardcoded a 60s latency-limit filter when xAxisField was
'p90_ttft' — meant to suppress fixed-seq overload outliers (conc=2048
rows that compress the rest of the chart to the left). For agentic
runs, TTFTs > 60s are normal (long prompts, multi-turn) so the filter
hid legitimate data points (e.g. only 7/12 visible for the latest B200
DSV4 ingest).

- Skip the latency-limit filter for agentic scenarios in both
  useChartData and processOverlayChartData.
- Broaden the TTFT-override detection from `=== 'p90_ttft'` to any
  `*_ttft` so the new median/p75/p99 percentile picks behave the same.
- Pass isAgentic into processOverlayChartData from ChartDisplay so the
  unofficial-run overlay path matches the official one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/hooks/useChartData.ts   | 10 +++++++---
 .../src/components/inference/ui/ChartDisplay.tsx |  1 +
 packages/app/src/components/inference/utils.ts   | 16 +++++++++++++---
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index ffa6a8a7..2557b0d8 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -350,7 +350,8 @@ export function useChartData(
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p90_ttft';
+        const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
@@ -365,11 +366,14 @@ export function useChartData(
                   roof,
                 };
               })
-              // When TTFT is on the x-axis, apply the latency limit to filter overload outliers
-              // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left)
+              // When TTFT is on the x-axis, apply the latency limit to filter
+              // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that
+              // compress all real data to the far left). Skip for agentic — long
+              // TTFTs there reflect real workloads (multi-turn, big prompts).
               .filter(
                 (d) =>
                   !isTtftX ||
+                  isAgentic ||
                   !chartDefinition.y_latency_limit ||
                   d.x <= chartDefinition.y_latency_limit,
               )
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index ca7f9cd7..12f9f5de 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -183,6 +183,7 @@ export default function ChartDisplay() {
         chartType,
         selectedYAxisMetric,
         effectiveXMetric,
+        { isAgentic: sequenceKind(selectedSequence) === 'agentic' },
       );
 
       let overlayPoints = processed;
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 735007ab..4876c614 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -75,11 +75,13 @@ export function processOverlayChartData(
   chartType: 'e2e' | 'interactivity',
   selectedYAxisMetric: string,
   selectedXAxisMetric: string | null,
+  options?: { isAgentic?: boolean },
 ): InferenceData[] {
   const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
   if (!chartDef) return [];
 
   const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+  const isAgentic = options?.isAgentic === true;
 
   // Resolve x-axis field (must match useChartData logic)
   const metricTitle =
@@ -87,8 +89,11 @@ export function processOverlayChartData(
   const isInputMetric = metricTitle.toLowerCase().includes('input');
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
-  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
-  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
+  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+  // Match any *_ttft metric — the x-axis-mode picker can now select any
+  // percentile (median/p75/p90/p99) depending on sequence kind.
+  const isTtftOverride =
+    typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
@@ -108,7 +113,12 @@ export function processOverlayChartData(
     })
     .filter(
       (d) =>
-        xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+        // Skip the latency limit for the natural x-axis or for agentic
+        // (long TTFTs are normal there, not overload outliers).
+        xAxisField === chartDef.x ||
+        isAgentic ||
+        !chartDef.y_latency_limit ||
+        d.x <= chartDef.y_latency_limit,
     );
 
   return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);

From 07194de6e5df1ca75d1f35085d178a2dc2625493 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:42:53 -0500
Subject: [PATCH 29/55] fix(trace-histograms): chunk DB query + blob-cache to
 escape size caps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Loading trace histograms for 30+ agentic points failed with HTTP 500
because the Neon serverless HTTP driver caps responses at 64 MB, and
each compressed profile_export.jsonl blob is ~1-2 MB — the JOIN
returned all matching blobs in one round-trip and exceeded the cap. With
no histogram data, the "View charts" button never appeared on the tooltip,
so users couldn't open the per-point detail page after the latest run.

- Chunk getTraceHistograms to 12 IDs per query so each round-trip stays
  well under the 64 MB cap. The chunk results are still merged into one map.
- Switch the route's cachedQuery to blobOnly so the larger JSON
  response doesn't hit the Next.js unstable_cache 2 MB limit either.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/trace-histograms/route.ts  |  5 +++
 packages/db/src/queries/trace-histograms.ts   | 31 +++++++++++++------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
index fd7572a8..7a959a65 100644
--- a/packages/app/src/app/api/v1/trace-histograms/route.ts
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -10,9 +10,14 @@ import { cachedJson, cachedQuery } from '@/lib/api-cache';
 
 export const dynamic = 'force-dynamic';
 
+// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB
+// unstable_cache limit (each point carries one int per request, ~500-1000+
+// requests for agentic), which manifests as a 500 from the route. Blob
+// storage lets us cache the larger response without losing the warm-cache hit.
 const getCachedTraceHistograms = cachedQuery(
   (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
   'trace-histograms',
+  { blobOnly: true },
 );
 
 const MAX_IDS_PER_REQUEST = 200;
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
index c243afd8..20ebc0d5 100644
--- a/packages/db/src/queries/trace-histograms.ts
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -27,21 +27,34 @@ export interface TraceHistogramPoint {
 
 export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
 
+/**
+ * Cap the number of blobs we pull in a single Neon HTTP query. The serverless
+ * driver returns 507 ("response is too large, max 64 MB") when the combined
+ * payload exceeds 64 MB. Each gzipped profile_export.jsonl blob can be ~1-2 MB,
+ * so 12 blobs per query stays well below the cap.
+ */
+const QUERY_CHUNK_SIZE = 12;
+
 export async function getTraceHistograms(
   sql: DbClient,
   benchmarkResultIds: number[],
 ): Promise<TraceHistogramMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  const rows = (await sql`
-    select
-      br.id as benchmark_result_id,
-      atr.profile_export_jsonl_gz as blob
-    from benchmark_results br
-    join agentic_trace_replay atr on atr.id = br.trace_replay_id
-    where br.id = any(${benchmarkResultIds}::bigint[])
-      and atr.profile_export_jsonl_gz is not null
-  `) as { benchmark_result_id: number; blob: Buffer }[];
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
 
   const result: TraceHistogramMap = {};
   for (const row of rows) {

From a1e594b34a8faa181af01e6c8449498eafa7e086 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:48:54 -0500
Subject: [PATCH 30/55] feat(inference): run selector actually filters chart
 data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two workflow runs land on the same date (e.g. re-ingesting a
config), the run picker's "Run 1/2" ↔ "Run 2/2" had no effect on the
chart — benchmarks API returned DISTINCT ON (config, conc, isl, osl)
ordered by date with no run tiebreaker, so Postgres arbitrarily picked
one row per config and both picker selections produced identical data.

Plumb runId through the request path:
- getLatestBenchmarks gets an optional runId branch that strictly
  scopes to one workflow_run (filter wr.github_run_id = $runId).
- /api/v1/benchmarks accepts ?runId=…, forwarded into the cached query
  so each run has its own blob-cache entry.
- fetchBenchmarks → benchmarkQueryOptions → useBenchmarks pass the
  runId through; React Query keys it for separate caches per run.
- useChartData accepts selectedRunId and forwards it.
- InferenceProvider only passes runId when the current date has >1
  runs — single-run dates keep the existing latest-per-config logic
  so configs from earlier dates remain visible.

Verified in the dashboard: switching Run 1/2 ↔ Run 2/2 fires distinct
requests with the correct runId and the chart re-renders per-run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/benchmarks/route.test.ts   | 24 +++++++++-
 .../app/src/app/api/v1/benchmarks/route.ts    |  7 +--
 .../components/inference/InferenceContext.tsx |  9 ++++
 .../inference/hooks/useChartData.ts           | 11 ++++-
 .../app/src/hooks/api/use-benchmarks.test.ts  | 21 +++++++-
 packages/app/src/hooks/api/use-benchmarks.ts  | 10 ++--
 packages/app/src/lib/api.ts                   |  3 ++
 packages/db/src/queries/benchmarks.ts         | 48 +++++++++++++++++++
 8 files changed, 122 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts
index 780f775e..92d5f326 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.test.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.test.ts
@@ -59,6 +59,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       undefined,
       undefined,
+      undefined,
     );
   });
 
@@ -72,6 +73,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       '2026-03-01',
       undefined,
+      undefined,
     );
   });
 
@@ -82,7 +84,27 @@ describe('GET /api/v1/benchmarks', () => {
       req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&exact=true'),
     );
     expect(res.status).toBe(200);
-    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith('mock-sql', ['dsr1'], '2026-03-01', true);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      '2026-03-01',
+      true,
+      undefined,
+    );
+  });
+
+  it('passes runId param to query when provided', async () => {
+    mockGetLatestBenchmarks.mockResolvedValueOnce([]);
+
+    const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=26194160120'));
+    expect(res.status).toBe(200);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      undefined,
+      undefined,
+      '26194160120',
+    );
   });
 
   it('returns 500 when query throws', async () => {
diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts
index c79f1aa7..c4037208 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.ts
@@ -11,10 +11,10 @@ import { loadFixture } from '@/lib/test-fixtures';
 export const dynamic = 'force-dynamic';
 
 const getCachedBenchmarks = cachedQuery(
-  (dbModelKeys: string[], date?: string, exact?: boolean) => {
+  (dbModelKeys: string[], date?: string, exact?: boolean, runId?: string) => {
     if (JSON_MODE)
       return Promise.resolve(jsonProvider.getLatestBenchmarks(dbModelKeys, date, exact));
-    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact);
+    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact, runId);
   },
   'benchmarks',
   { blobOnly: true },
@@ -25,6 +25,7 @@ export async function GET(request: NextRequest) {
   const model = params.get('model') ?? '';
   const date = params.get('date') ?? undefined;
   const exact = params.get('exact') === 'true';
+  const runId = params.get('runId') ?? undefined;
   const dbModelKeys = DISPLAY_MODEL_TO_DB[model];
   if (!dbModelKeys || dbModelKeys.length === 0) {
     return NextResponse.json({ error: 'Unknown model' }, { status: 400 });
@@ -32,7 +33,7 @@ export async function GET(request: NextRequest) {
   if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks'));
 
   try {
-    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined);
+    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
     return cachedJson(rows);
   } catch (error) {
     console.error('Error fetching benchmarks:', error);
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 74bdb28b..edf0974e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -214,6 +214,14 @@ export function InferenceProvider({
   // ── Data fetching (gated by isActive) ──────────────────────────────────────
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
+  // Run-selector scoping: only constrain benchmark data to a specific run when
+  // the current date has >1 runs (ambiguous case). When there's one run per
+  // date, the picker is informational and the SQL's latest-per-config logic
+  // already returns that run's data — passing runId would needlessly narrow
+  // the cross-date config view.
+  const multipleRunsOnDate = availableRuns && Object.keys(availableRuns).length > 1;
+  const benchmarkRunId = multipleRunsOnDate && selectedRunId ? String(selectedRunId) : undefined;
+
   const {
     graphs,
     loading: chartDataLoading,
@@ -236,6 +244,7 @@ export function InferenceProvider({
     latestDate,
     selectedPercentile,
     compareGpuPair ?? null,
+    benchmarkRunId,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 2557b0d8..328750f0 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -86,10 +86,19 @@ export function useChartData(
   selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
+  /**
+   * GitHub run id (g_runid) from the run picker. When set, the benchmarks API
+   * scopes results to that workflow run instead of returning the latest per
+   * config — disambiguates when two runs land on the same date.
+   */
+  selectedRunId?: string,
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
   // materialized view instead of firing a redundant second fetch with identical data.
+  // When a specific run is selected, we always go through the runId branch and the
+  // date is effectively ignored — keep queryDate set so React Query still has a
+  // distinct cache key per date if the user navigates back to "latest".
   const queryDate =
     selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
       ? ''
@@ -99,7 +108,7 @@ export function useChartData(
     data: allRows,
     isLoading: queryLoading,
     error: queryError,
-  } = useBenchmarks(selectedModel, queryDate, enabled);
+  } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts
index 7329896d..c4f49130 100644
--- a/packages/app/src/hooks/api/use-benchmarks.test.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.test.ts
@@ -5,12 +5,29 @@ import { benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 describe('benchmarkQueryOptions', () => {
   it('builds query key from model and date', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01');
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest', '']);
   });
 
   it('builds exact query key when exact=true', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true);
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', '']);
+  });
+
+  it('includes runId in query key when provided', () => {
+    const opts = benchmarkQueryOptions(
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      true,
+      false,
+      '26194160120',
+    );
+    expect(opts.queryKey).toEqual([
+      'benchmarks',
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      'latest',
+      '26194160120',
+    ]);
   });
 
   it('produces distinct keys for different models', () => {
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index 6da1568e..8fd1f4e9 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -8,14 +8,16 @@ export function benchmarkQueryOptions(
   date: string,
   enabled = true,
   exact?: boolean,
+  runId?: string,
 ) {
   return {
-    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest'] as const,
-    queryFn: ({ signal }: { signal: AbortSignal }) => fetchBenchmarks(model, date, exact, signal),
+    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? ''] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      fetchBenchmarks(model, date, exact, signal, runId),
     enabled: enabled && Boolean(model),
   };
 }
 
-export function useBenchmarks(model: string, date?: string, enabled = true) {
-  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled));
+export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
+  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
 }
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 98587c2f..31cf906a 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -121,10 +121,13 @@ export function fetchBenchmarks(
   date?: string,
   exact?: boolean,
   signal?: AbortSignal,
+  /** Optional github_run_id to scope to a specific workflow run. */
+  runId?: string,
 ) {
   const params = new URLSearchParams({ model });
   if (date) params.set('date', date);
   if (exact) params.set('exact', 'true');
+  if (runId) params.set('runId', runId);
   return fetchJson<BenchmarkRow[]>(`/api/v1/benchmarks?${params}`, signal);
 }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 36bb0e65..2291dc0c 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -53,8 +53,56 @@ export async function getLatestBenchmarks(
   modelKey: string | string[],
   date?: string,
   exact?: boolean,
+  /**
+   * If set, filter to a specific GitHub Actions workflow run.
+   * Bypasses the "latest per config" logic — when two runs landed on the same
+   * date and the user picked one in the run selector, this scopes the chart
+   * data to that run only. Value matches the URL param `g_runid` (a
+   * stringified github_run_id, not the DB id).
+   */
+  runId?: string,
 ): Promise<BenchmarkRow[]> {
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
+  if (runId) {
+    const rows = await sql`
+      SELECT
+        br.id,
+        c.hardware,
+        c.framework,
+        c.model,
+        c.precision,
+        c.spec_method,
+        c.disagg,
+        c.is_multinode,
+        c.prefill_tp,
+        c.prefill_ep,
+        c.prefill_dp_attention,
+        c.prefill_num_workers,
+        c.decode_tp,
+        c.decode_ep,
+        c.decode_dp_attention,
+        c.decode_num_workers,
+        c.num_prefill_gpu,
+        c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
+        br.isl,
+        br.osl,
+        br.conc,
+        br.image,
+        br.metrics,
+        br.date::text,
+        CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url
+      FROM benchmark_results br
+      JOIN configs c ON c.id = br.config_id
+      JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+      WHERE c.model = ANY(${modelKeys})
+        AND br.error IS NULL
+        AND wr.github_run_id = ${runId}::bigint
+      ORDER BY br.config_id, br.conc, br.isl, br.osl
+    `;
+    return rows as unknown as BenchmarkRow[];
+  }
   if (date) {
     // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest)
     // exact=true: only return data from this exact date (for GPU comparison)

From b0d228abeb344aa2ced0e2c5ab2ac43e0128a17e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 00:11:34 -0500
Subject: [PATCH 31/55] feat(inference): Session Time + Prefill TPS x-axis
 (live from trace blobs)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two new agentic-only chart variants per
https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa, computed
live from the stored aiperf profile_export.jsonl blobs (no backfill needed):

- Session Time: mean across sessions of Σ per-turn request_latency,
  rescaled by mean_load / session_load. The summed-latency definition
  inherently strips inter-turn tool/thinking gaps (only GPU active time
  contributes).
- Prefill TPS / user: per turn ISL / TTFT, P90 across the session's turns,
  mean across sessions. Captures worst-turn prefill responsiveness.

The buttons only show on agentic scenarios (gated by a mounted flag to
keep SSR identical to the first client render). Roofline corners match the
expected Pareto direction: Session Time sweeps bottom-left → top-right;
Prefill TPS sweeps top-left → bottom-right.

Plumbing:
- New `getDerivedAgenticMetrics(sql, ids)` in packages/db loads the JSONL
  blobs in chunks of 6 per query so we stay under Neon's 64 MB cap. Includes
  a 5-case unit suite for the math.
- New `/api/v1/derived-agentic-metrics` route + `useDerivedAgenticMetrics`
  hook, mirroring trace-histograms (blob-cached).
- ChartDisplay fetches derived metrics for visible agentic point IDs and
  overrides scatter data.x + chart heading + axis label + roofline corner.

Two side-effects fixed along the way:
- Hydration mismatch from URL-driven initial state: x-axis-mode now seeds
  from a fixed default and applies the URL value post-mount.
- The run-selector scoping regression where DSR1 (no model-matching
  changelog on its date) tried to fetch with a runId from a different
  model's run and got zero rows. Only pass runId when there are >1 runs
  whose CHANGELOG explicitly mentions the current model + precision.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts     |   2 +-
 .../api/v1/derived-agentic-metrics/route.ts   |  68 ++++++
 .../components/inference/InferenceContext.tsx |  93 ++++++--
 .../app/src/components/inference/types.ts     |  10 +-
 .../components/inference/ui/ChartDisplay.tsx  | 114 +++++++--
 .../hooks/api/use-derived-agentic-metrics.ts  |  41 ++++
 .../queries/derived-agentic-metrics.test.ts   |  96 ++++++++
 .../db/src/queries/derived-agentic-metrics.ts | 224 ++++++++++++++++++
 8 files changed, 612 insertions(+), 36 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
 create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.test.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.ts

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index 2d3c982f..152e3f98 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -195,7 +195,7 @@ export function createMockInferenceContext(
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
-    selectedXAxisMode: 'interactivity',
+    selectedXAxisMode: 'interactivity' as const,
     setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
new file mode 100644
index 00000000..e5f6e0b2
--- /dev/null
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -0,0 +1,68 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getDerivedAgenticMetrics,
+  type DerivedAgenticMetricMap,
+} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: the response is one entry per id with two numbers, but the
+// derivation work parses thousands of JSONL records per blob — cache the
+// computed result so a chart-refresh hits the warm path.
+const getCachedDerivedAgenticMetrics = cachedQuery(
+  (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
+  'derived-agentic-metrics',
+  { blobOnly: true },
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/derived-agentic-metrics?ids=1,2,3
+ *
+ * Returns per-id derived metrics computed live from the stored aiperf
+ * profile_export.jsonl blobs:
+ *  - normalized_session_time_s: mean across sessions of session e2e time
+ *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
+ *  - mean_p90_prefill_tps_per_user: mean across sessions of P90 (over the
+ *    session's turns) prefill TPS/user (ISL / TTFT).
+ *
+ * Ids without a trace_replay blob or with unparseable records are omitted.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  // Ids identify benchmark rows, so only positive safe integers are valid.
+  // Fractional ("1.5") and non-numeric tokens are dropped here rather than
+  // being forwarded to the query layer, and duplicates collapse via the Set
+  // so each id is looked up once.
+  const parsed = raw.split(',').map((s) => Number(s.trim()));
+  const ids = [...new Set(parsed.filter((n) => Number.isSafeInteger(n) && n > 0))];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    // Sort before the cached lookup so the argument is order-independent;
+    // toSorted already returns a new array, so no defensive copy is needed.
+    const sorted = ids.toSorted((a, b) => a - b);
+    const result = await getCachedDerivedAgenticMetrics(sorted);
+    return cachedJson(result);
+  } catch (error) {
+    console.error('Error fetching derived agentic metrics:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index edf0974e..2e5a245f 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -137,17 +137,32 @@ export function InferenceProvider({
   // computing a kind-based default here would diverge between server and client
   // and cause a hydration mismatch. The scenario-kind default is applied in a
   // post-mount effect below (and a ref tracks whether the user has overridden).
-  const urlXMode = (() => {
+  type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  const VALID_X_MODES: XAxisMode[] = [
+    'ttft',
+    'e2e',
+    'interactivity',
+    'session-time',
+    'prefill-tps',
+  ];
+  // SSR has no URL access, so seed with a fixed default and apply the URL
+  // value (if any) in a post-mount effect — keeps server + client first render
+  // identical and avoids "didn't match" hydration warnings when the URL holds
+  // a non-default mode.
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<XAxisMode>('ttft');
+  const xAxisModeFromUrlRef = useRef(false);
+  useEffect(() => {
+    if (xAxisModeFromUrlRef.current) return;
     const v = getUrlParam('i_xmode');
-    return v === 'ttft' || v === 'e2e' || v === 'interactivity' ? v : null;
-  })();
-  const [selectedXAxisMode, setSelectedXAxisMode] = useState<'ttft' | 'e2e' | 'interactivity'>(
-    urlXMode ?? 'ttft',
-  );
-  const xAxisModeFromUrlRef = useRef(urlXMode !== null);
+    if (v && (VALID_X_MODES as string[]).includes(v)) {
+      xAxisModeFromUrlRef.current = true;
+      setSelectedXAxisMode(v as XAxisMode);
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
   // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
   // existing useChartData pipeline keys off that flag for the e2e chart's x-axis.
-  const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
+  const handleSetXAxisMode = useCallback((mode: XAxisMode) => {
     xAxisModeFromUrlRef.current = true;
     setSelectedXAxisMode(mode);
     // The e2e chart's x-axis metric is reconciled in a separate effect below,
@@ -215,12 +230,37 @@ export function InferenceProvider({
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
   // Run-selector scoping: only constrain benchmark data to a specific run when
-  // the current date has >1 runs (ambiguous case). When there's one run per
-  // date, the picker is informational and the SQL's latest-per-config logic
-  // already returns that run's data — passing runId would needlessly narrow
-  // the cross-date config view.
-  const multipleRunsOnDate = availableRuns && Object.keys(availableRuns).length > 1;
-  const benchmarkRunId = multipleRunsOnDate && selectedRunId ? String(selectedRunId) : undefined;
+  // there's actually a disambiguation to make for the CURRENT model. The
+  // raw `availableRuns` is across ALL models on the date, so the picker may
+  // auto-select a run that produced nothing for the current model — passing
+  // that runId would return zero rows and hide the chart entirely.
+  // Compute the set of runs whose CHANGELOG explicitly mentions this model +
+  // precision. We can't reuse `filterRunsByModel` here because it has a
+  // fallback that returns all runs when nothing matches (so the picker still
+  // renders) — which would make us pass a runId that produced no rows for
+  // the current model, hiding the chart.
+  const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING)
+    .filter(([, model]) => model === selectedModel)
+    .map(([prefix]) => prefix);
+  const runIdsWithModelChangelog: string[] = [];
+  if (availableRuns) {
+    for (const [runId, runInfo] of Object.entries(availableRuns)) {
+      if (!runInfo.changelog) continue;
+      const matches = runInfo.changelog.entries.some((entry) =>
+        entry.config_keys.some((key) => {
+          const parts = key.split('-');
+          return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!);
+        }),
+      );
+      if (matches) runIdsWithModelChangelog.push(runId);
+    }
+  }
+  const benchmarkRunId =
+    selectedRunId &&
+    runIdsWithModelChangelog.length > 1 &&
+    runIdsWithModelChangelog.includes(selectedRunId)
+      ? String(selectedRunId)
+      : undefined;
 
   const {
     graphs,
@@ -367,11 +407,30 @@ export function InferenceProvider({
   useEffect(() => {
     const kind = sequenceKind(effectiveSequence);
     const isInitialMount = lastSeqKindRef.current === null;
-    if (!isInitialMount && lastSeqKindRef.current === kind) return;
+    const isAgenticOnlyMode =
+      selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps';
+    // When the kind hasn't changed, bail — unless an agentic-only mode is
+    // active on a fixed-seq sequence (e.g. restored from the URL after mount).
+    // In that case force the snap so the chart doesn't try to plot
+    // trace-derived metrics against rows that have no trace_replay.
+    if (!isInitialMount && lastSeqKindRef.current === kind) {
+      if (kind === 'fixed-seq' && isAgenticOnlyMode) {
+        handleSetXAxisMode('interactivity');
+      }
+      return;
+    }
     lastSeqKindRef.current = kind;
-    if (isInitialMount && xAxisModeFromUrlRef.current) return;
+    if (
+      isInitialMount &&
+      xAxisModeFromUrlRef.current &&
+      !(kind === 'fixed-seq' && isAgenticOnlyMode)
+    ) {
+      // URL-restored agentic-only mode on a fixed-seq sequence makes no sense
+      // — fall through to the default snap below.
+      return;
+    }
     handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
-  }, [effectiveSequence, handleSetXAxisMode]);
+  }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
 
   // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
   // agentic percentile changes. For fixed-seq the JSONB only carries
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 0a9908e3..bedded40 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -544,13 +544,17 @@ export interface InferenceChartContextType {
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
   /**
    * Which chart variant the user wants to see — the inference card shows one chart
-   * at a time, picked by the big TTFT / E2E Latency / Interactivity buttons.
+   * at a time, picked by the big buttons above the chart.
    * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
    * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
    * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   * - 'session-time'  → agentic-only; x = mean-normalized session time (live-computed from trace blobs)
+   * - 'prefill-tps'   → agentic-only; x = mean of P90 prefill TPS/user per session
    */
-  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity';
-  setSelectedXAxisMode: (mode: 'ttft' | 'e2e' | 'interactivity') => void;
+  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  setSelectedXAxisMode: (
+    mode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
+  ) => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   setIsLegendExpanded: (metric: boolean) => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 12f9f5de..63953b30 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,7 +1,7 @@
 'use client';
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
-import { useMemo, useRef, useState } from 'react';
+import { useEffect, useMemo, useRef, useState } from 'react';
 import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
@@ -42,6 +42,7 @@ import {
   sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
+import { useDerivedAgenticMetrics } from '@/hooks/api/use-derived-agentic-metrics';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
 import { hardwareKeyMatchesAnyBase } from '@/lib/constants';
 
@@ -62,20 +63,25 @@ import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 type InferenceViewMode = 'chart' | 'table';
 
 /**
- * The three chart variants the user can choose with the big buttons above the
- * chart card. Each maps to one entry in `inference-chart-config.json` plus a
- * forced x-axis override for the E2E chartType.
+ * The chart variants the user can choose with the big buttons above the chart
+ * card. The first three map to entries in `inference-chart-config.json` plus a
+ * forced x-axis override for the E2E chartType; the last two are agentic-only
+ * derived metrics computed live from the stored trace_replay blobs.
  */
-type XAxisMode = 'ttft' | 'e2e' | 'interactivity';
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
 
 interface XAxisModeButton {
   value: XAxisMode;
   label: string;
+  /** When true, the button is only shown on agentic scenarios. */
+  agenticOnly?: boolean;
 }
 const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
   { value: 'ttft', label: 'TTFT' },
   { value: 'e2e', label: 'E2E Latency' },
   { value: 'interactivity', label: 'Interactivity' },
+  { value: 'session-time', label: 'Session Time', agenticOnly: true },
+  { value: 'prefill-tps', label: 'Prefill TPS / user', agenticOnly: true },
 ];
 
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
@@ -134,6 +140,13 @@ export default function ChartDisplay() {
     totalDatesQueried,
   } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates);
 
+  // SSR has no URL access and `selectedSequence` defaults to agentic on the
+  // server even when the URL says fixed-seq — so any conditional rendering
+  // that keys off `sequenceKind(selectedSequence)` would diverge between
+  // server and client first render. Defer agentic-only UI until after mount.
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const [viewModes, setViewModes] = useState<Record<number, InferenceViewMode>>({});
   const replayHandlesRef = useRef<Record<number, ReplayLauncherHandle | null>>({});
   const getViewMode = (index: number): InferenceViewMode => viewModes[index] ?? 'chart';
@@ -301,15 +314,74 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
-  // Show one chart at a time, picked by the TTFT / E2E / Interactivity buttons.
-  // Both 'ttft' and 'e2e' modes render the e2e chart (the x-axis swap is handled
-  // upstream by `selectedE2eXAxisMetric`, which `setSelectedXAxisMode` keeps in sync).
+  // Show one chart at a time, picked by the buttons above the chart.
+  //  - 'interactivity' renders the interactivity chartType.
+  //  - 'ttft' / 'e2e' render the e2e chartType (x swap via selectedE2eXAxisMetric).
+  //  - 'session-time' / 'prefill-tps' render the e2e chartType too; the x-axis
+  //    is overridden below from live-computed derived metrics.
   const visibleGraphs = useMemo(() => {
     const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
     const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
     return filtered.length > 0 ? filtered : effectiveGraphs;
   }, [effectiveGraphs, selectedXAxisMode]);
 
+  // Derived-metric path: fetch live-computed values from the trace_replay blobs
+  // and override scatter data.x. Only fires for the two agentic-only modes.
+  const useDerived =
+    sequenceKind(selectedSequence) === 'agentic' &&
+    (selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps');
+  const derivedTargetIds = useMemo(() => {
+    if (!useDerived) return [] as number[];
+    const ids = new Set<number>();
+    for (const g of visibleGraphs) {
+      for (const d of g.data) {
+        if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') {
+          ids.add(d.id);
+        }
+      }
+    }
+    return [...ids];
+  }, [useDerived, visibleGraphs]);
+  const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
+  const derivedMetrics = derivedQuery.data;
+
+  const renderableGraphs = useMemo(() => {
+    if (!useDerived) return visibleGraphs;
+    if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] }));
+    const isSession = selectedXAxisMode === 'session-time';
+    const xLabel = isSession
+      ? 'Mean Normalized Session Time (s)'
+      : 'Mean P90 Prefill TPS per user (tok/s)';
+    // Roofline corner = which corner the curve sweeps from / toward, matching
+    // existing chart-config convention:
+    //  - session-time: as concurrency rises, session time AND throughput both
+    //    grow → curve goes bottom-left → top-right → upper_right.
+    //  - prefill-tps:  as concurrency rises, per-user prefill TPS falls while
+    //    total throughput rises → curve goes top-left → bottom-right →
+    //    upper_left.
+    const rooflineCorner = isSession ? 'upper_right' : 'upper_left';
+    return visibleGraphs.map((g) => {
+      const overriddenChartDef = {
+        ...g.chartDefinition,
+        x_label: xLabel,
+        // y_latency_limit was meant to suppress fixed-seq overload outliers on
+        // the TTFT axis — irrelevant for these derived axes.
+        y_latency_limit: undefined,
+        [`${selectedYAxisMetric}_roofline` as keyof typeof g.chartDefinition]: rooflineCorner,
+      };
+      const data = g.data
+        .map((d) => {
+          if (typeof d.id !== 'number') return null;
+          const m = derivedMetrics[d.id];
+          const v = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          if (v === null || v === undefined || !Number.isFinite(v)) return null;
+          return { ...d, x: v };
+        })
+        .filter((d): d is NonNullable<typeof d> => d !== null);
+      return { ...g, chartDefinition: overriddenChartDef, data };
+    });
+  }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
+
   const displayGraphs = isFirstLoad
     ? [
         <Card key="skeleton-0">
@@ -318,9 +390,9 @@ export default function ChartDisplay() {
           <Skeleton className="h-[600px] w-full" />
         </Card>,
       ]
-    : visibleGraphs.length === 0
+    : renderableGraphs.length === 0
       ? []
-      : visibleGraphs.map((graph, graphIndex) => {
+      : renderableGraphs.map((graph, graphIndex) => {
           const isTimelineMode = Boolean(
             selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
           );
@@ -396,11 +468,16 @@ export default function ChartDisplay() {
                               return 'vs. P90 Time To First Token';
                             }
 
-                            // For e2e chart: heading is driven by the TTFT / E2E button
-                            // selection above the card, so the inline dropdown is gone.
-                            // The metric carries the percentile prefix (e.g. p90_ttft,
-                            // median_ttft for fixed-seq, p75_ttft for agentic+p75).
+                            // For e2e chart: heading is driven by the buttons above the
+                            // card. Derived-metric modes win first; otherwise the metric
+                            // carries the percentile prefix (e.g. p90_ttft, median_ttft).
                             if (graph.chartDefinition.chartType === 'e2e') {
+                              if (selectedXAxisMode === 'session-time') {
+                                return 'vs. Mean Normalized Session Time';
+                              }
+                              if (selectedXAxisMode === 'prefill-tps') {
+                                return 'vs. Mean P90 Prefill TPS / user';
+                              }
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
                                 const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
@@ -601,7 +678,14 @@ export default function ChartDisplay() {
         aria-label="Chart x-axis metric"
         data-testid="x-axis-mode-buttons"
       >
-        {X_AXIS_MODE_BUTTONS.map(({ value, label }) => {
+        {X_AXIS_MODE_BUTTONS.filter(({ agenticOnly }) => {
+          if (!agenticOnly) return true;
+          // Before client mount, always render the agentic-only buttons: the
+          // server-default kind is agentic, so SSR + first client render produce
+          // identical DOM. After mount, hide them on fixed-seq sequences.
+          if (!mounted) return true;
+          return sequenceKind(selectedSequence) === 'agentic';
+        }).map(({ value, label }) => {
           const isActive = selectedXAxisMode === value;
           return (
             <button
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
new file mode 100644
index 00000000..108312ee
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -0,0 +1,41 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface DerivedAgenticMetric {
+  id: number;
+  /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
+   *  by mean_load / session_load. Null when the JSONL had no usable records. */
+  normalized_session_time_s: number | null;
+  /** Mean across sessions of (P90 over turns of ISL/TTFT). Null when no
+   *  prefill rates could be computed. */
+  mean_p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+async function fetchDerivedAgenticMetrics(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<DerivedAgenticMetricMap> {
+  if (ids.length === 0) return {};
+  const response = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal });
+  if (response.ok) return (await response.json()) as DerivedAgenticMetricMap;
+  throw new Error(`derived-agentic-metrics ${response.status}`);
+}
+
+/**
+ * React Query hook behind the "Session Time" and "Prefill TPS/user" chart
+ * variants: loads per-id metrics the API derives live from the stored aiperf
+ * profile_export.jsonl. Ids are deduplicated and sorted first, so any ordering
+ * of the same set shares one query key; the query stays idle while disabled or
+ * when no ids remain. Ids without a trace_replay blob (older or non-aiperf
+ * agentic runs) are silently omitted from the response.
+ */
+export function useDerivedAgenticMetrics(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: ['derived-agentic-metrics', uniqueIds.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchDerivedAgenticMetrics(uniqueIds, signal),
+    enabled: enabled && uniqueIds.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..795be28a
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -0,0 +1,96 @@
+import { describe, expect, it } from 'vitest';
+
+import { computeDerivedFromBlob } from './derived-agentic-metrics.js';
+
+/** Build one aiperf JSONL record for the synthetic fixture. */
+function rec(
+  conversation_id: string,
+  turn_index: number,
+  fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number },
+): string {
+  return JSON.stringify({
+    metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' },
+    metrics: {
+      request_latency: { value: fields.latency_ms, unit: 'ms' },
+      time_to_first_token: { value: fields.ttft_ms, unit: 'ms' },
+      input_sequence_length: { value: fields.isl, unit: 'tokens' },
+      output_sequence_length: { value: fields.osl, unit: 'tokens' },
+    },
+  });
+}
+
+describe('computeDerivedFromBlob', () => {
+  it('returns nulls when no usable records', () => {
+    const out = computeDerivedFromBlob('');
+    expect(out.normalized_session_time_s).toBeNull();
+    expect(out.mean_p90_prefill_tps_per_user).toBeNull();
+  });
+
+  it('rescales single-session time and computes P90 prefill', () => {
+    // One session, two turns. load = (100+50) + (200+50) = 400.
+    // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s.
+    const jsonl = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → P90 within session = 200.
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('rescales times across sessions with unequal load', () => {
+    // s1: 1 turn, load = 100, T = 1s
+    // s2: 1 turn, load = 300, T = 3s
+    // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2
+    // Mean T̃ = 2.0
+    const jsonl = [
+      rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(2, 6);
+  });
+
+  it('drops records missing required fields and skips non-profiling phase', () => {
+    const lines = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      // missing TTFT — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' },
+        metrics: {
+          request_latency: { value: 1000, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      // warmup phase — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' },
+        metrics: {
+          request_latency: { value: 9999, unit: 'ms' },
+          time_to_first_token: { value: 9999, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+    ];
+    const out = computeDerivedFromBlob(lines.join('\n'));
+    expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('p90 across turns: 10-turn session picks the right rank', () => {
+    // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910.
+    const turns = Array.from({ length: 10 }, (_, i) =>
+      rec('s1', i, {
+        isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens
+        osl: 10,
+        ttft_ms: 1000, // 1 second → rates: 100..1000 tps
+        latency_ms: 1500,
+      }),
+    );
+    const out = computeDerivedFromBlob(turns.join('\n'));
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+  });
+});
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
new file mode 100644
index 00000000..14f3adcf
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -0,0 +1,224 @@
+/**
+ * Live-computed per-point metrics derived from the stored aiperf
+ * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB
+ * because they require grouping by `conversation_id` and aggregating per
+ * session — work that is cheap for a single agentic point but adds up across
+ * many points, so it is only performed when the metric is actually plotted.
+ *
+ * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal
+ *   (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of
+ *   per-turn `request_latency` per session (inter-turn tool/thinking gaps are
+ *   inherently excluded since we only sum per-request latencies, not wallclock).
+ *   Each session's time is rescaled by `mean_load / session_load`, where load
+ *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
+ *
+ * - mean_p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Per session: P90 across its
+ *   turns. Across sessions: arithmetic mean. Captures the worst-turn prefill
+ *   responsiveness from the end-user perspective.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface DerivedAgenticMetric {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Mean normalized session time in seconds. */
+  normalized_session_time_s: number | null;
+  /** Mean across sessions of (P90 prefill tps/user across the session's turns). */
+  mean_p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+/**
+ * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless
+ * HTTP driver caps responses at 64 MB — chunk to stay well under.
+ */
+const QUERY_CHUNK_SIZE = 6;
+
+interface RecordMetrics {
+  request_latency?: { value?: number; unit?: string } | number;
+  time_to_first_token?: { value?: number; unit?: string } | number;
+  input_sequence_length?: { value?: number } | number;
+  output_sequence_length?: { value?: number } | number;
+}
+
+interface RecordMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  benchmark_phase?: string;
+}
+
+interface ProfileRecord {
+  metadata?: RecordMetadata;
+  metrics?: RecordMetrics;
+}
+
+interface TurnFields {
+  request_latency_ms: number;
+  ttft_ms: number;
+  isl: number;
+  osl: number;
+}
+
+/** Pull a finite numeric metric out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return Number.isFinite(v) ? v : undefined; // JSON `1e999` parses to Infinity
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+function extractTurn(rec: ProfileRecord): TurnFields | null {
+  const m = rec.metrics ?? {};
+  const rl = readNum(m.request_latency);
+  const tt = readNum(m.time_to_first_token);
+  const isl = readNum(m.input_sequence_length);
+  const osl = readNum(m.output_sequence_length);
+  if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null;
+  if (rl <= 0 || tt <= 0 || isl <= 0) return null;
+  return { request_latency_ms: rl, ttft_ms: tt, isl, osl };
+}
+
+/** Linear-interpolated percentile (matches numpy's default linear method). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+  if (xs.length === 0) return Number.NaN;
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/**
+ * Parse one point's JSONL and return the two derived metrics. Both fields
+ * of the result are `null` if the blob has no usable records.
+ */
+export function computeDerivedFromBlob(jsonl: string): {
+  normalized_session_time_s: number | null;
+  mean_p90_prefill_tps_per_user: number | null;
+} {
+  // Group records by conversation_id, filter to the profiling phase.
+  const bySession = new Map<string, TurnFields[]>();
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const sid = rec.metadata?.conversation_id;
+    if (!sid) continue;
+    const turn = extractTurn(rec);
+    if (!turn) continue;
+    let list = bySession.get(sid);
+    if (!list) {
+      list = [];
+      bySession.set(sid, list);
+    }
+    list.push(turn);
+  }
+  if (bySession.size === 0) {
+    return { normalized_session_time_s: null, mean_p90_prefill_tps_per_user: null };
+  }
+
+  // Per-session aggregates.
+  const sessionTimesS: number[] = [];
+  const sessionLoads: number[] = [];
+  const sessionP90Prefill: number[] = [];
+  for (const turns of bySession.values()) {
+    let timeMs = 0;
+    let load = 0;
+    const prefillRates: number[] = [];
+    for (const t of turns) {
+      timeMs += t.request_latency_ms;
+      load += t.isl + t.osl;
+      const ttftSec = t.ttft_ms / 1000;
+      if (ttftSec > 0) prefillRates.push(t.isl / ttftSec);
+    }
+    if (load > 0) {
+      sessionTimesS.push(timeMs / 1000);
+      sessionLoads.push(load);
+    }
+    if (prefillRates.length > 0) {
+      prefillRates.sort((a, b) => a - b);
+      sessionP90Prefill.push(quantile(prefillRates, 0.9));
+    }
+  }
+
+  // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+  let normalized: number | null = null;
+  if (sessionTimesS.length > 0) {
+    const meanLoad = meanOf(sessionLoads);
+    if (meanLoad > 0) {
+      const scaled: number[] = [];
+      for (let i = 0; i < sessionTimesS.length; i++) {
+        const ti = sessionTimesS[i]!;
+        const li = sessionLoads[i]!;
+        if (li > 0) scaled.push(ti * (meanLoad / li));
+      }
+      normalized = scaled.length > 0 ? meanOf(scaled) : null;
+    }
+  }
+
+  const prefill = sessionP90Prefill.length > 0 ? meanOf(sessionP90Prefill) : null;
+
+  return {
+    normalized_session_time_s: normalized,
+    mean_p90_prefill_tps_per_user: prefill,
+  };
+}
+
+export async function getDerivedAgenticMetrics(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<DerivedAgenticMetricMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
+
+  const result: DerivedAgenticMetricMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const { normalized_session_time_s, mean_p90_prefill_tps_per_user } =
+        computeDerivedFromBlob(jsonl);
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        normalized_session_time_s,
+        mean_p90_prefill_tps_per_user,
+      };
+    } catch {
+      // Skip malformed blobs silently — frontend treats missing ids as "no data".
+    }
+  }
+  return result;
+}

From 8af1f5cd42f6d423ded91c04310345a09343fa34 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:20:29 -0400
Subject: [PATCH 32/55] fix(inference): show Mean Normalized Session Time in
 minutes

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/ChartDisplay.tsx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 63953b30..6be524b4 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -350,7 +350,7 @@ export default function ChartDisplay() {
     if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] }));
     const isSession = selectedXAxisMode === 'session-time';
     const xLabel = isSession
-      ? 'Mean Normalized Session Time (s)'
+      ? 'Mean Normalized Session Time (min)'
       : 'Mean P90 Prefill TPS per user (tok/s)';
     // Roofline corner = which corner the curve sweeps from / toward, matching
     // existing chart-config convention:
@@ -373,8 +373,9 @@ export default function ChartDisplay() {
         .map((d) => {
           if (typeof d.id !== 'number') return null;
           const m = derivedMetrics[d.id];
-          const v = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
-          if (v === null || v === undefined || !Number.isFinite(v)) return null;
+          const raw = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
+          const v = isSession ? raw / 60 : raw;
           return { ...d, x: v };
         })
         .filter((d): d is NonNullable<typeof d> => d !== null);

From be34e97dd07ca02de674be04c312f62f779cc95a Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:23:34 -0400
Subject: [PATCH 33/55] fix(inference): use global P90 of per-turn prefill
 TPS/user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the per-session P90 + cross-session mean sandwich; pool every turn
into one array and take a single P90 so the tail isn't dampened. Field
renamed mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user across
DB query, API, frontend hook, and chart labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/v1/derived-agentic-metrics/route.ts   |  4 +-
 .../components/inference/ui/ChartDisplay.tsx  |  6 +--
 .../hooks/api/use-derived-agentic-metrics.ts  |  6 +--
 .../queries/derived-agentic-metrics.test.ts   | 10 ++---
 .../db/src/queries/derived-agentic-metrics.ts | 41 +++++++++----------
 5 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index e5f6e0b2..c45173e5 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -28,8 +28,8 @@ const MAX_IDS_PER_REQUEST = 200;
  * profile_export.jsonl blobs:
  *  - normalized_session_time_s: mean across sessions of session e2e time
  *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
- *  - mean_p90_prefill_tps_per_user: mean across sessions of P90 (over the
- *    session's turns) prefill TPS/user (ISL / TTFT).
+ *  - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT)
+ *    across every turn in every session.
  *
  * Ids without a trace_replay blob or with unparseable records are omitted.
  */
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 6be524b4..bd3064d0 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -351,7 +351,7 @@ export default function ChartDisplay() {
     const isSession = selectedXAxisMode === 'session-time';
     const xLabel = isSession
       ? 'Mean Normalized Session Time (min)'
-      : 'Mean P90 Prefill TPS per user (tok/s)';
+      : 'P90 Prefill TPS per user (tok/s)';
     // Roofline corner = which corner the curve sweeps from / toward, matching
     // existing chart-config convention:
     //  - session-time: as concurrency rises, session time AND throughput both
@@ -373,7 +373,7 @@ export default function ChartDisplay() {
         .map((d) => {
           if (typeof d.id !== 'number') return null;
           const m = derivedMetrics[d.id];
-          const raw = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          const raw = isSession ? m?.normalized_session_time_s : m?.p90_prefill_tps_per_user;
           if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
           const v = isSession ? raw / 60 : raw;
           return { ...d, x: v };
@@ -477,7 +477,7 @@ export default function ChartDisplay() {
                                 return 'vs. Mean Normalized Session Time';
                               }
                               if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. Mean P90 Prefill TPS / user';
+                                return 'vs. P90 Prefill TPS / user';
                               }
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
index 108312ee..6bc7ae5e 100644
--- a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -5,9 +5,9 @@ export interface DerivedAgenticMetric {
   /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
    *  by mean_load / session_load. Null when the JSONL had no usable records. */
   normalized_session_time_s: number | null;
-  /** Mean across sessions of (P90 over turns of ISL/TTFT). Null when no
-   *  prefill rates could be computed. */
-  mean_p90_prefill_tps_per_user: number | null;
+  /** P90 of per-turn ISL/TTFT across every turn in every session.
+   *  Null when no prefill rates could be computed. */
+  p90_prefill_tps_per_user: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
index 795be28a..321434be 100644
--- a/packages/db/src/queries/derived-agentic-metrics.test.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -23,7 +23,7 @@ describe('computeDerivedFromBlob', () => {
   it('returns nulls when no usable records', () => {
     const out = computeDerivedFromBlob('');
     expect(out.normalized_session_time_s).toBeNull();
-    expect(out.mean_p90_prefill_tps_per_user).toBeNull();
+    expect(out.p90_prefill_tps_per_user).toBeNull();
   });
 
   it('rescales single-session time and computes P90 prefill', () => {
@@ -35,8 +35,8 @@ describe('computeDerivedFromBlob', () => {
     ].join('\n');
     const out = computeDerivedFromBlob(jsonl);
     expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
-    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → P90 within session = 200.
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200.
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
   });
 
   it('rescales times across sessions with unequal load', () => {
@@ -77,7 +77,7 @@ describe('computeDerivedFromBlob', () => {
     ];
     const out = computeDerivedFromBlob(lines.join('\n'));
     expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
   });
 
   it('p90 across turns: 10-turn session picks the right rank', () => {
@@ -91,6 +91,6 @@ describe('computeDerivedFromBlob', () => {
       }),
     );
     const out = computeDerivedFromBlob(turns.join('\n'));
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6);
   });
 });
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index 14f3adcf..ac6fd38d 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -12,10 +12,10 @@
  *   Each session's time is rescaled by `mean_load / session_load`, where load
  *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
  *
- * - mean_p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
- *   Per turn: prefill_tps = ISL / TTFT_seconds. Per session: P90 across its
- *   turns. Across sessions: arithmetic mean. Captures the worst-turn prefill
- *   responsiveness from the end-user perspective.
+ * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn
+ *   in every session — the per-session percentile + cross-session mean
+ *   sandwich was discarded because it just dampens tail behavior.
  */
 
 import { gunzipSync } from 'node:zlib';
@@ -27,8 +27,8 @@ export interface DerivedAgenticMetric {
   id: number;
   /** Mean normalized session time in seconds. */
   normalized_session_time_s: number | null;
-  /** Mean across sessions of (P90 prefill tps/user across the session's turns). */
-  mean_p90_prefill_tps_per_user: number | null;
+  /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */
+  p90_prefill_tps_per_user: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
@@ -109,7 +109,7 @@ function meanOf(xs: number[]): number {
  */
 export function computeDerivedFromBlob(jsonl: string): {
   normalized_session_time_s: number | null;
-  mean_p90_prefill_tps_per_user: number | null;
+  p90_prefill_tps_per_user: number | null;
 } {
   // Group records by conversation_id, filter to the profiling phase.
   const bySession = new Map<string, TurnFields[]>();
@@ -134,31 +134,27 @@ export function computeDerivedFromBlob(jsonl: string): {
     list.push(turn);
   }
   if (bySession.size === 0) {
-    return { normalized_session_time_s: null, mean_p90_prefill_tps_per_user: null };
+    return { normalized_session_time_s: null, p90_prefill_tps_per_user: null };
   }
 
-  // Per-session aggregates.
+  // Per-session aggregates for session time; per-turn prefill rates pool into
+  // a single global array so the percentile sees the full distribution.
   const sessionTimesS: number[] = [];
   const sessionLoads: number[] = [];
-  const sessionP90Prefill: number[] = [];
+  const allPrefillRates: number[] = [];
   for (const turns of bySession.values()) {
     let timeMs = 0;
     let load = 0;
-    const prefillRates: number[] = [];
     for (const t of turns) {
       timeMs += t.request_latency_ms;
       load += t.isl + t.osl;
       const ttftSec = t.ttft_ms / 1000;
-      if (ttftSec > 0) prefillRates.push(t.isl / ttftSec);
+      if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec);
     }
     if (load > 0) {
       sessionTimesS.push(timeMs / 1000);
       sessionLoads.push(load);
     }
-    if (prefillRates.length > 0) {
-      prefillRates.sort((a, b) => a - b);
-      sessionP90Prefill.push(quantile(prefillRates, 0.9));
-    }
   }
 
   // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
@@ -176,11 +172,15 @@ export function computeDerivedFromBlob(jsonl: string): {
     }
   }
 
-  const prefill = sessionP90Prefill.length > 0 ? meanOf(sessionP90Prefill) : null;
+  let prefill: number | null = null;
+  if (allPrefillRates.length > 0) {
+    allPrefillRates.sort((a, b) => a - b);
+    prefill = quantile(allPrefillRates, 0.9);
+  }
 
   return {
     normalized_session_time_s: normalized,
-    mean_p90_prefill_tps_per_user: prefill,
+    p90_prefill_tps_per_user: prefill,
   };
 }
 
@@ -209,12 +209,11 @@ export async function getDerivedAgenticMetrics(
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');
-      const { normalized_session_time_s, mean_p90_prefill_tps_per_user } =
-        computeDerivedFromBlob(jsonl);
+      const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl);
       result[Number(row.benchmark_result_id)] = {
         id: Number(row.benchmark_result_id),
         normalized_session_time_s,
-        mean_p90_prefill_tps_per_user,
+        p90_prefill_tps_per_user,
       };
     } catch {
       // Skip malformed blobs silently — frontend treats missing ids as "no data".

From c774c005f7c2dfc1fa451e293df5d6456ba5be71 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:29:27 -0400
Subject: [PATCH 34/55] fix(inference): no-data flash on session-time /
 prefill-tps modes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two root causes for "No data available" when flipping to these modes:

1. Stale blob-cache: the v1 cache key still holds responses with the
   pre-rename `mean_p90_prefill_tps_per_user` field. The frontend's new
   `p90_prefill_tps_per_user` lookup misses → every row filters out.
   Bump the cache key to `derived-agentic-metrics-v2` to force a refresh.

2. Loading flicker: while the derived-metrics fetch is in flight we were
   passing empty `data: []` to ScatterGraph, which surfaces the misleading
   "change your filters" empty-state. Gate skeleton rendering on the
   derived query's pending/fetching state instead.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/v1/derived-agentic-metrics/route.ts   |   5 +-
 .../components/inference/ui/ChartDisplay.tsx  | 439 +++++++++---------
 2 files changed, 230 insertions(+), 214 deletions(-)

diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index c45173e5..6ce7c017 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -13,9 +13,12 @@ export const dynamic = 'force-dynamic';
 // blobOnly: the response is one entry per id with two numbers, but the
 // derivation work parses thousands of JSONL records per blob — cache the
 // computed result so a chart-refresh hits the warm path.
+// Bumped to v2 when mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user.
+// Stale v1 cache entries return undefined for the new field and silently
+// blank the chart with "No data available".
 const getCachedDerivedAgenticMetrics = cachedQuery(
   (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
-  'derived-agentic-metrics',
+  'derived-agentic-metrics-v2',
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index bd3064d0..fd6cd9c1 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -344,6 +344,14 @@ export default function ChartDisplay() {
   }, [useDerived, visibleGraphs]);
   const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
   const derivedMetrics = derivedQuery.data;
+  // Show skeleton (not "No data available") while the derived-metrics query
+  // is in flight. Without this gate, every flip to session-time / prefill-tps
+  // briefly blanks the chart and surfaces a misleading empty-state.
+  const isDerivedLoading =
+    useDerived &&
+    derivedTargetIds.length > 0 &&
+    (derivedQuery.isPending || derivedQuery.isFetching) &&
+    !derivedMetrics;
 
   const renderableGraphs = useMemo(() => {
     if (!useDerived) return visibleGraphs;
@@ -383,191 +391,181 @@ export default function ChartDisplay() {
     });
   }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
 
-  const displayGraphs = isFirstLoad
-    ? [
-        <Card key="skeleton-0">
-          <Skeleton className="h-7 w-2/4 mb-1" />
-          <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-[600px] w-full" />
-        </Card>,
-      ]
-    : renderableGraphs.length === 0
-      ? []
-      : renderableGraphs.map((graph, graphIndex) => {
-          const isTimelineMode = Boolean(
-            selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
-          );
-          const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
-          return (
-            <section key={graphIndex} className="pt-8 md:pt-0">
-              <figure data-testid="chart-figure" className="relative rounded-lg">
-                <ChartButtons
-                  chartId={`chart-${graphIndex}`}
-                  analyticsPrefix={
-                    isTimelineMode
-                      ? 'gpu_timeseries'
-                      : graph.chartDefinition.chartType === 'e2e'
-                        ? 'latency'
-                        : 'interactivity'
-                  }
-                  leadingControls={
-                    <SegmentedToggle
-                      value={getViewMode(graphIndex)}
-                      options={VIEW_MODE_OPTIONS}
-                      onValueChange={(v) => handleViewModeChange(graphIndex, v)}
-                      ariaLabel="View mode"
-                      testId={`inference-view-toggle-${graphIndex}`}
-                    />
-                  }
-                  hideImageExport={getViewMode(graphIndex) === 'table'}
-                  setIsLegendExpanded={setIsLegendExpanded}
-                  exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
-                  onExportMp4={
-                    replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
-                  }
-                  onExportCsv={() => {
-                    const visibleData = graph.data.filter((d) =>
+  const displayGraphs =
+    isFirstLoad || isDerivedLoading
+      ? [
+          <Card key="skeleton-0">
+            <Skeleton className="h-7 w-2/4 mb-1" />
+            <Skeleton className="h-5 w-3/4 mb-2" />
+            <Skeleton className="h-[600px] w-full" />
+          </Card>,
+        ]
+      : renderableGraphs.length === 0
+        ? []
+        : renderableGraphs.map((graph, graphIndex) => {
+            const isTimelineMode = Boolean(
+              selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+            );
+            const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+            return (
+              <section key={graphIndex} className="pt-8 md:pt-0">
+                <figure data-testid="chart-figure" className="relative rounded-lg">
+                  <ChartButtons
+                    chartId={`chart-${graphIndex}`}
+                    analyticsPrefix={
                       isTimelineMode
-                        ? activeDates.has(`${d.date}_${d.hwKey}`)
-                        : activeHwTypes.has(d.hwKey as string) &&
-                          selectedPrecisions.includes(d.precision),
-                    );
-                    const { headers, rows } = inferenceChartToCsv(
-                      visibleData,
-                      graph.model,
-                      graph.sequence,
-                    );
-                    exportToCsv(
-                      `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
-                      headers,
-                      rows,
-                    );
-                  }}
-                />
-                <Card>
-                  {(() => {
-                    const chartCaption = (
-                      <>
-                        <h2 className="text-lg font-semibold">
-                          {
-                            graph.chartDefinition[
-                              `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                            ]
-                          }{' '}
-                          {(() => {
-                            // For Input metrics with dynamic x-axis, use dynamic heading
-                            const metricTitle =
-                              (graph.chartDefinition[
+                        ? 'gpu_timeseries'
+                        : graph.chartDefinition.chartType === 'e2e'
+                          ? 'latency'
+                          : 'interactivity'
+                    }
+                    leadingControls={
+                      <SegmentedToggle
+                        value={getViewMode(graphIndex)}
+                        options={VIEW_MODE_OPTIONS}
+                        onValueChange={(v) => handleViewModeChange(graphIndex, v)}
+                        ariaLabel="View mode"
+                        testId={`inference-view-toggle-${graphIndex}`}
+                      />
+                    }
+                    hideImageExport={getViewMode(graphIndex) === 'table'}
+                    setIsLegendExpanded={setIsLegendExpanded}
+                    exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+                    onExportMp4={
+                      replayAvailable
+                        ? () => replayHandlesRef.current[graphIndex]?.open()
+                        : undefined
+                    }
+                    onExportCsv={() => {
+                      const visibleData = graph.data.filter((d) =>
+                        isTimelineMode
+                          ? activeDates.has(`${d.date}_${d.hwKey}`)
+                          : activeHwTypes.has(d.hwKey as string) &&
+                            selectedPrecisions.includes(d.precision),
+                      );
+                      const { headers, rows } = inferenceChartToCsv(
+                        visibleData,
+                        graph.model,
+                        graph.sequence,
+                      );
+                      exportToCsv(
+                        `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+                        headers,
+                        rows,
+                      );
+                    }}
+                  />
+                  <Card>
+                    {(() => {
+                      const chartCaption = (
+                        <>
+                          <h2 className="text-lg font-semibold">
+                            {
+                              graph.chartDefinition[
                                 `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                              ] as string) || '';
-                            const isInputMetric = metricTitle.toLowerCase().includes('input');
-                            if (
-                              graph.chartDefinition.chartType === 'interactivity' &&
-                              isInputMetric &&
-                              selectedXAxisMetric === 'p90_ttft'
-                            ) {
-                              return 'vs. P90 Time To First Token';
-                            }
-
-                            // For e2e chart: heading is driven by the buttons above the
-                            // card. Derived-metric modes win first; otherwise the metric
-                            // carries the percentile prefix (e.g. p90_ttft, median_ttft).
-                            if (graph.chartDefinition.chartType === 'e2e') {
-                              if (selectedXAxisMode === 'session-time') {
-                                return 'vs. Mean Normalized Session Time';
+                              ]
+                            }{' '}
+                            {(() => {
+                              // For Input metrics with dynamic x-axis, use dynamic heading
+                              const metricTitle =
+                                (graph.chartDefinition[
+                                  `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                                ] as string) || '';
+                              const isInputMetric = metricTitle.toLowerCase().includes('input');
+                              if (
+                                graph.chartDefinition.chartType === 'interactivity' &&
+                                isInputMetric &&
+                                selectedXAxisMetric === 'p90_ttft'
+                              ) {
+                                return 'vs. P90 Time To First Token';
                               }
-                              if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. P90 Prefill TPS / user';
-                              }
-                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
-                                const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
-                                const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
-                                return `vs. ${word} Time To First Token`;
+
+                              // For e2e chart: heading is driven by the buttons above the
+                              // card. Derived-metric modes win first; otherwise the metric
+                              // carries the percentile prefix (e.g. p90_ttft, median_ttft).
+                              if (graph.chartDefinition.chartType === 'e2e') {
+                                if (selectedXAxisMode === 'session-time') {
+                                  return 'vs. Mean Normalized Session Time';
+                                }
+                                if (selectedXAxisMode === 'prefill-tps') {
+                                  return 'vs. P90 Prefill TPS / user';
+                                }
+                                const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                                if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                  const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                  const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
+                                  return `vs. ${word} Time To First Token`;
+                                }
+                                const pctlWord = selectedPercentile.toUpperCase();
+                                return isAgentic
+                                  ? `vs. ${pctlWord} End-to-end Latency`
+                                  : 'vs. End-to-end Latency';
                               }
-                              const pctlWord = selectedPercentile.toUpperCase();
-                              return isAgentic
-                                ? `vs. ${pctlWord} End-to-end Latency`
-                                : 'vs. End-to-end Latency';
-                            }
 
-                            // Fall back to the heading baked into chartDefinition
-                            // by useChartData (already resolves per-metric overrides
-                            // and applies the agentic percentile rewrite).
-                            return graph.chartDefinition.heading;
-                          })()}
-                        </h2>
-                        <p className="text-sm text-muted-foreground mb-2">
-                          {getModelLabel(graph.model as Model)} •{' '}
-                          {selectedPrecisions
-                            .map((prec) => getPrecisionLabel(prec as Precision))
-                            .join(', ')}{' '}
-                          • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
-                          {isUnofficialRun
-                            ? 'Source: UNOFFICIAL'
-                            : 'Source: SemiAnalysis InferenceX™'}
-                          {selectedRunDate && (
-                            <>
-                              {' '}
-                              • Updated:{' '}
-                              {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
-                                'en-US',
-                                {
-                                  year: 'numeric',
-                                  month: '2-digit',
-                                  day: '2-digit',
-                                  timeZone: 'UTC',
-                                },
-                              )}
-                            </>
-                          )}
-                        </p>
-                        <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
-                        <UnofficialDomainNotice />
-                      </>
-                    );
-
-                    if (getViewMode(graphIndex) === 'table') {
-                      const overlay =
-                        graph.chartDefinition.chartType === 'e2e'
-                          ? overlayDataByChartType.e2e
-                          : overlayDataByChartType.interactivity;
-                      const overlayRows = (overlay?.data ?? []).filter((p) =>
-                        selectedPrecisions.includes(p.precision),
-                      );
-                      return (
-                        <>
-                          {chartCaption}
-                          <InferenceTable
-                            data={
-                              overlayRows.length > 0 ? [...graph.data, ...overlayRows] : graph.data
-                            }
-                            chartDefinition={graph.chartDefinition}
-                            selectedYAxisMetric={selectedYAxisMetric}
-                          />
+                              // Fall back to the heading baked into chartDefinition
+                              // by useChartData (already resolves per-metric overrides
+                              // and applies the agentic percentile rewrite).
+                              return graph.chartDefinition.heading;
+                            })()}
+                          </h2>
+                          <p className="text-sm text-muted-foreground mb-2">
+                            {getModelLabel(graph.model as Model)} •{' '}
+                            {selectedPrecisions
+                              .map((prec) => getPrecisionLabel(prec as Precision))
+                              .join(', ')}{' '}
+                            • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+                            {isUnofficialRun
+                              ? 'Source: UNOFFICIAL'
+                              : 'Source: SemiAnalysis InferenceX™'}
+                            {selectedRunDate && (
+                              <>
+                                {' '}
+                                • Updated:{' '}
+                                {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+                                  'en-US',
+                                  {
+                                    year: 'numeric',
+                                    month: '2-digit',
+                                    day: '2-digit',
+                                    timeZone: 'UTC',
+                                  },
+                                )}
+                              </>
+                            )}
+                          </p>
+                          <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          <UnofficialDomainNotice />
                         </>
                       );
-                    }
 
-                    return selectedDateRange.startDate &&
-                      selectedDateRange.endDate &&
-                      selectedGPUs.length > 0 ? (
-                      <GPUGraph
-                        chartId={`chart-${graphIndex}`}
-                        modelLabel={graph.model}
-                        data={graph.data}
-                        xLabel={graph.chartDefinition.x_label}
-                        yLabel={`${
-                          graph.chartDefinition[
-                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                          ]
-                        }`}
-                        chartDefinition={graph.chartDefinition}
-                        caption={chartCaption}
-                      />
-                    ) : (
-                      <div className="relative">
-                        <ScatterGraph
+                      if (getViewMode(graphIndex) === 'table') {
+                        const overlay =
+                          graph.chartDefinition.chartType === 'e2e'
+                            ? overlayDataByChartType.e2e
+                            : overlayDataByChartType.interactivity;
+                        const overlayRows = (overlay?.data ?? []).filter((p) =>
+                          selectedPrecisions.includes(p.precision),
+                        );
+                        return (
+                          <>
+                            {chartCaption}
+                            <InferenceTable
+                              data={
+                                overlayRows.length > 0
+                                  ? [...graph.data, ...overlayRows]
+                                  : graph.data
+                              }
+                              chartDefinition={graph.chartDefinition}
+                              selectedYAxisMetric={selectedYAxisMetric}
+                            />
+                          </>
+                        );
+                      }
+
+                      return selectedDateRange.startDate &&
+                        selectedDateRange.endDate &&
+                        selectedGPUs.length > 0 ? (
+                        <GPUGraph
                           chartId={`chart-${graphIndex}`}
                           modelLabel={graph.model}
                           data={graph.data}
@@ -579,43 +577,58 @@ export default function ChartDisplay() {
                           }`}
                           chartDefinition={graph.chartDefinition}
                           caption={chartCaption}
-                          overlayData={
-                            graph.chartDefinition.chartType === 'e2e'
-                              ? (overlayDataByChartType.e2e ?? undefined)
-                              : (overlayDataByChartType.interactivity ?? undefined)
-                          }
                         />
-                        {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
-                            <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
-                              <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range to view GPU comparison
-                              </p>
-                            </div>
-                          )}
-                      </div>
-                    );
-                  })()}
-                  {replayAvailable && (
-                    <ReplayLauncher
-                      ref={(handle) => {
-                        replayHandlesRef.current[graphIndex] = handle;
-                      }}
-                      parentChartId={`chart-${graphIndex}`}
-                      chartDefinition={graph.chartDefinition}
-                      yLabel={`${
-                        graph.chartDefinition[
-                          `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                        ]
-                      }`}
-                      xLabel={graph.chartDefinition.x_label}
-                    />
-                  )}
-                </Card>
-              </figure>
-            </section>
-          );
-        });
+                      ) : (
+                        <div className="relative">
+                          <ScatterGraph
+                            chartId={`chart-${graphIndex}`}
+                            modelLabel={graph.model}
+                            data={graph.data}
+                            xLabel={graph.chartDefinition.x_label}
+                            yLabel={`${
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                              ]
+                            }`}
+                            chartDefinition={graph.chartDefinition}
+                            caption={chartCaption}
+                            overlayData={
+                              graph.chartDefinition.chartType === 'e2e'
+                                ? (overlayDataByChartType.e2e ?? undefined)
+                                : (overlayDataByChartType.interactivity ?? undefined)
+                            }
+                          />
+                          {selectedGPUs.length > 0 &&
+                            (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
+                              <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
+                                <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
+                                  Select a date range to view GPU comparison
+                                </p>
+                              </div>
+                            )}
+                        </div>
+                      );
+                    })()}
+                    {replayAvailable && (
+                      <ReplayLauncher
+                        ref={(handle) => {
+                          replayHandlesRef.current[graphIndex] = handle;
+                        }}
+                        parentChartId={`chart-${graphIndex}`}
+                        chartDefinition={graph.chartDefinition}
+                        yLabel={`${
+                          graph.chartDefinition[
+                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                          ]
+                        }`}
+                        xLabel={graph.chartDefinition.x_label}
+                      />
+                    )}
+                  </Card>
+                </figure>
+              </section>
+            );
+          });
 
   return (
     <div data-testid="inference-chart-display" className="flex flex-col gap-4">

From d5dbda773ef653d715cb1d0634c2b70cc94a826f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 15:40:32 -0500
Subject: [PATCH 35/55] feat(agentic-detail): aggregates-across-configs view

Adds a 'Per-point / Aggregates across configs' toggle near the SKU header
on /inference/agentic/[id]. The aggregates view replaces the per-point
charts with four multi-line charts (ISL, OSL, KV cache util, prefix cache
hit rate) showing how mean/P50/P75/P90/P99 vary across every sibling
config in the SKU. X-axis is sibling labels matching SiblingNav chips
(parallelism + concurrency); each percentile gets its own colored line.

Plumbing:
- `getAgenticAggregates(sql, ids)` in packages/db parses both the
  profile_export.jsonl (per-request ISL/OSL) and the server_metrics_json
  (KV cache util + prefix hit rate time-series) per id, then computes the
  five statistics (mean/P50/P75/P90/P99). 6-case unit suite covers
  percentile math, JSONL parsing, and the prefix-hit derivation.
- /api/v1/agentic-aggregates blob-cached like trace-histograms.
- New `useAgenticAggregates` hook + new AggregateChart component
  (multi-line with hover + ExpandableChart parity).

Memory + transport handling:
- Each row pulls TWO compressed blobs and `server_metrics_json_gz` can be
  up to ~17 MB compressed per high-conc row. Chunked query at size 2
  keeps each Neon HTTP response under the 64 MB cap and limits Node heap
  to ~one chunk's worth of decompressed JSON at a time (parallel chunks
  OOM'd on a 12-sibling SKU).
- Slow path runs ~20s on a 12-sibling SKU; cached afterwards (blobOnly).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/api/v1/agentic-aggregates/route.ts    |  64 +++
 .../agentic-point/agentic-point-detail.tsx    | 479 ++++++++++++------
 .../agentic-point/aggregate-chart.tsx         | 230 +++++++++
 .../inference/agentic-point/sibling-nav.tsx   |   2 +-
 .../src/hooks/api/use-agentic-aggregates.ts   |  45 ++
 .../db/src/queries/agentic-aggregates.test.ts | 113 +++++
 packages/db/src/queries/agentic-aggregates.ts | 255 ++++++++++
 7 files changed, 1020 insertions(+), 168 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/agentic-aggregates/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
 create mode 100644 packages/app/src/hooks/api/use-agentic-aggregates.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.test.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.ts

diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
new file mode 100644
index 00000000..63cb2dc0
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -0,0 +1,64 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getAgenticAggregates,
+  type AgenticAggregateMap,
+} from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: response stays small (a few numbers per id), but generating it
+// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the
+// "Aggregates" toggle stays snappy.
+const getCachedAgenticAggregates = cachedQuery(
+  (ids: number[]): Promise<AgenticAggregateMap> => getAgenticAggregates(getDb(), ids),
+  'agentic-aggregates',
+  { blobOnly: true },
+);
+
+const MAX_IDS_PER_REQUEST = 200; // most distinct ids one GET may request; more → 400
+
+/**
+ * GET /api/v1/agentic-aggregates?ids=1,2,3 — per-id mean/p50/p75/p90/p99 for
+ * ISL, OSL, KV cache utilization, and prefix cache hit rate, computed live
+ * from the stored aiperf profile_export.jsonl + server_metrics_json blobs.
+ * Ids without a trace_replay blob (or with no usable samples) get nulls.
+ * Tokens that are not positive safe integers are dropped; 400 when `ids` is
+ * missing, empty after filtering, or over MAX_IDS_PER_REQUEST; 500 on failure.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+  // Dedupe and keep only positive safe integers (drops '', 'abc', '1.5', '-3').
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    // Canonical (ascending) order so equivalent id sets reach the cached query identically.
+    const sorted = ids.toSorted((a, b) => a - b);
+    return cachedJson(await getCachedAgenticAggregates(sorted));
+  } catch (error) {
+    console.error('Error fetching agentic aggregates:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index ee58332d..a5bca4e0 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -2,8 +2,10 @@
 
 import Link from 'next/link';
 import { useRouter } from 'next/navigation';
+import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
+import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -12,10 +14,12 @@ import {
   type TimeSeriesPoint,
 } from '@/hooks/api/use-trace-server-metrics';
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
 
+import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
-import { SiblingNav } from './sibling-nav';
+import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
   TimeSeriesChart,
@@ -78,6 +82,28 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
+type DetailView = 'point' | 'aggregates'; // layouts offered by the detail-view toggle
+const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
+  { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
+];
+
+/**
+ * Adapts one sibling's percentile summary to an AggregateChart point. A missing
+ * summary (null/undefined) yields an empty `values` map rather than zeros.
+ */
+function toAggPoint(
+  sibling: { id: number; label: string },
+  pct: { mean: number; p50: number; p75: number; p90: number; p99: number } | null | undefined,
+): AggregatePoint {
+  const { id, label } = sibling;
+  // Copy only the five known keys so extra fields on `pct` never leak into `values`.
+  const values: Partial<Record<PercentileKey, number>> = pct
+    ? { mean: pct.mean, p50: pct.p50, p75: pct.p75, p90: pct.p90, p99: pct.p99 }
+    : {};
+  return { id, label, values };
+}
+
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
   const histQuery = useTraceHistograms([id], true);
@@ -88,6 +114,13 @@ export function AgenticPointDetail({ id }: Props) {
   const metrics = metricsQuery.data;
   const siblingsData = siblingsQuery.data;
 
+  const [view, setView] = useState<DetailView>('point');
+  // Fetch aggregates only when the aggregates view is active. Uses the full
+  // sibling set (across parallelism + concurrency configs) so each chart
+  // shows how the metric varies across the SKU.
+  const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
+  const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
       <div className="flex items-center gap-2">
@@ -128,180 +161,292 @@ export function AgenticPointDetail({ id }: Props) {
         </div>
       )}
 
-      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-        <ExpandableChart
-          title="Input sequence length distribution"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
-            return histQuery.isLoading ? <Skeleton /> : <Empty />;
-          }}
-        />
-        <ExpandableChart
-          title="Output sequence length distribution"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
-            return histQuery.isLoading ? <Skeleton /> : <Empty />;
-          }}
+      <div className="flex items-center justify-between gap-3">
+        <SegmentedToggle
+          value={view}
+          options={VIEW_OPTIONS}
+          onValueChange={setView}
+          ariaLabel="Detail view"
+          testId="detail-view-toggle"
+          buttonClassName="px-3 py-1.5 text-sm"
         />
+        {view === 'aggregates' && (
+          <span className="text-xs text-muted-foreground">
+            {siblingIds.length} configs in SKU
+            {aggregatesQuery.isLoading ? ' · loading…' : ''}
+          </span>
+        )}
+      </div>
 
-        <ExpandableChart
-          title="KV cache utilization over time"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'GPU KV cache (avg n=50)',
-                    data: rollingAverage(metrics.kvCacheUsage, 50),
-                    rawData: metrics.kvCacheUsage,
-                    color: '#3b82f6',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yMax={1}
-                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                yAxisLabel="KV cache (%)"
-                {...size}
-              />
-            );
-          }}
+      {view === 'aggregates' ? (
+        <AggregatesGrid
+          siblings={siblingsData?.siblings ?? []}
+          aggregates={aggregatesQuery.data}
+          isLoading={aggregatesQuery.isLoading}
         />
+      ) : (
+        <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+          <ExpandableChart
+            title="Input sequence length distribution"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
+              return histQuery.isLoading ? <Skeleton /> : <Empty />;
+            }}
+          />
+          <ExpandableChart
+            title="Output sequence length distribution"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
+              return histQuery.isLoading ? <Skeleton /> : <Empty />;
+            }}
+          />
 
-        <ExpandableChart
-          title="Request queue depth"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'Running (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.running,
-                      })),
-                      50,
-                    ),
-                    color: '#22c55e',
-                    strokeWidth: 2,
-                  },
-                  {
-                    name: 'Waiting (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.waiting,
-                      })),
-                      50,
-                    ),
-                    color: '#ef4444',
-                    strokeWidth: 2,
-                  },
-                  {
-                    name: 'Total (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.total,
-                      })),
-                      50,
-                    ),
-                    color: '#3b82f6',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yAxisLabel="Requests"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="KV cache utilization over time"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'GPU KV cache (avg n=50)',
+                      data: rollingAverage(metrics.kvCacheUsage, 50),
+                      rawData: metrics.kvCacheUsage,
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yMax={1}
+                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                  yAxisLabel="KV cache (%)"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Prefix cache hit rate per interval"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'GPU (HBM, avg n=50)',
-                    data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                    rawData: metrics.prefixCacheHitRate,
-                    color: '#a855f7',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yMax={1}
-                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                yAxisLabel="Hit rate (%)"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Request queue depth"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Running (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.running,
+                        })),
+                        50,
+                      ),
+                      color: '#22c55e',
+                      strokeWidth: 2,
+                    },
+                    {
+                      name: 'Waiting (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.waiting,
+                        })),
+                        50,
+                      ),
+                      color: '#ef4444',
+                      strokeWidth: 2,
+                    },
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.total,
+                        })),
+                        50,
+                      ),
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Requests"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Throughput (total & decode)"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'Total (avg n=50)',
-                    data: rollingAverage(total, 50),
-                    color: '#3b82f6',
-                    strokeWidth: 1.6,
-                  },
-                  {
-                    name: 'Decode (avg n=50)',
-                    data: rollingAverage(metrics.decodeTps, 50),
-                    color: '#f97316',
-                    strokeWidth: 1.6,
-                  },
-                  {
-                    name: 'Total running avg',
-                    data: cumulativeAverage(total),
-                    color: '#ef4444',
-                    strokeWidth: 3,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yAxisLabel="Tokens / sec"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Prefix cache hit rate per interval"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'GPU (HBM, avg n=50)',
+                      data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                      rawData: metrics.prefixCacheHitRate,
+                      color: '#a855f7',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yMax={1}
+                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                  yAxisLabel="Hit rate (%)"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Cumulative prompt token source breakdown"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <StackedAreaChart
-                sourceSeries={metrics.promptTokensBySource}
-                durationS={metrics.durationS}
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Throughput (total & decode)"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(total, 50),
+                      color: '#3b82f6',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Decode (avg n=50)',
+                      data: rollingAverage(metrics.decodeTps, 50),
+                      color: '#f97316',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Total running avg',
+                      data: cumulativeAverage(total),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens / sec"
+                  {...size}
+                />
+              );
+            }}
+          />
+
+          <ExpandableChart
+            title="Cumulative prompt token source breakdown"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <StackedAreaChart
+                  sourceSeries={metrics.promptTokensBySource}
+                  durationS={metrics.durationS}
+                  {...size}
+                />
+              );
+            }}
+          />
+        </div>
+      )}
+    </div>
+  );
+}
+
+/** Grid for the "Aggregates across configs" view: one percentile chart per metric. */
+function AggregatesGrid(props: {
+  siblings: {
+    id: number;
+    conc: number;
+    decode_tp: number;
+    decode_ep: number;
+    disagg: boolean;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    offload_mode?: string | null;
+  }[];
+  aggregates: AgenticAggregateMap | undefined;
+  isLoading: boolean;
+}) {
+  const { siblings, aggregates, isLoading } = props;
+  if (siblings.length === 0) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        SKU sibling list not loaded yet — open a point to populate.
       </div>
+    );
+  }
+  if (isLoading && !aggregates) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        Computing aggregates across {siblings.length} configs… (parsing trace blobs)
+      </div>
+    );
+  }
+  // Assert to chipLabel's declared parameter type instead of `any` so the call stays type-checked.
+  type ChipInput = Parameters<typeof chipLabel>[0];
+  const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s as ChipInput) }));
+  const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl));
+  const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl));
+  const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil));
+  const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate));
+  return (
+    <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+      <ExpandableChart
+        title="ISL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={islPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="OSL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={oslPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="KV cache utilization (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={kvPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="Prefix cache hit rate (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={prefixPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
     </div>
   );
 }
diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
new file mode 100644
index 00000000..446677ad
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -0,0 +1,230 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+
+export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99'; // keys of AggregatePoint.values
+
+interface PercentileLine {
+  key: PercentileKey; // which entry of AggregatePoint.values this line plots
+  /** Display label in legend / tooltip. */
+  label: string;
+  color: string; // used for the line stroke, marker fill, legend swatch and hover item
+}
+
+const PERCENTILE_LINES: PercentileLine[] = [ // array order = legend, hover-item and draw order
+  { key: 'mean', label: 'Mean', color: '#ef4444' },
+  { key: 'p50', label: 'P50', color: '#3b82f6' },
+  { key: 'p75', label: 'P75', color: '#22c55e' },
+  { key: 'p90', label: 'P90', color: '#f59e0b' },
+  { key: 'p99', label: 'P99', color: '#a855f7' },
+];
+
+export interface AggregatePoint {
+  /** Sibling label rendered on x-axis and returned as the hover title (e.g. "TP8 • c=8"). */
+  label: string;
+  /** Per-percentile value; missing or non-finite percentiles are dropped from the plot. */
+  values: Partial<Record<PercentileKey, number>>;
+  /** Sibling id — informational only; AggregateChart itself never reads it. */
+  id?: number;
+}
+
+/**
+ * Multi-line chart for the "Aggregates across configs" view: one evenly
+ * spaced x-position per sibling config, one line per percentile. A missing or
+ * non-finite value breaks that line; an empty `points` renders "No data".
+ */
+export function AggregateChart({
+  points,
+  unit,
+  yMax,
+  yFmt,
+  width = 720,
+  height = 320,
+}: {
+  points: readonly AggregatePoint[];
+  unit: string; // only shown in the legend row ("units: …")
+  /** Optional fixed y-axis upper bound (e.g. 1 for percentages). */
+  yMax?: number;
+  /** Optional value formatter (e.g. percentage → "30%"). */
+  yFmt?: (v: number) => string;
+  width?: number;
+  height?: number;
+}) {
+  const W = width;
+  const H = height;
+  const PAD = { top: 16, right: 16, bottom: 90, left: 64 }; // bottom presumably fits rotated labels
+  const fmt = (v: number) =>
+    yFmt
+      ? yFmt(v)
+      : v >= 10000
+        ? new Intl.NumberFormat('en-US').format(Math.round(v))
+        : v.toFixed(v < 10 ? 2 : 0);
+
+  const computed = useMemo(() => { // null when there are no points to plot
+    if (points.length === 0) return null;
+    let yMaxComputed = 0; // largest finite value across all points and percentiles
+    for (const p of points) {
+      for (const line of PERCENTILE_LINES) {
+        const v = p.values[line.key];
+        if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v;
+      }
+    }
+    const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05); // 5% headroom
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    return { yTop, innerW, innerH };
+  }, [points, W, H, PAD.left, PAD.right, PAD.top, PAD.bottom, yMax]);
+
+  if (!computed) {
+    return (
+      <div className="grid place-items-center text-xs text-muted-foreground" style={{ height: H }}>
+        No data
+      </div>
+    );
+  }
+  const { yTop, innerW, innerH } = computed;
+
+  // X positions: evenly spaced across the inner width; a lone point is centred.
+  const xOf = (i: number) =>
+    points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW;
+  const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH; // 0 → bottom edge, yTop → top edge
+
+  // 5 y-axis ticks evenly between 0 and yTop (inclusive of both ends).
+  const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4);
+
+  // Resolve hover: snap to nearest sibling index (clamped to the first/last
+  // point) and emit all percentiles that have finite data at that x.
+  const resolve = (fraction: number) => {
+    const idx = Math.round(fraction * (points.length - 1));
+    const p = points[Math.max(0, Math.min(points.length - 1, idx))];
+    if (!p) return null;
+    const items: HoverItem[] = [];
+    for (const line of PERCENTILE_LINES) {
+      const v = p.values[line.key];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      items.push({ color: line.color, label: line.label, value: fmt(v) });
+    }
+    return { items, title: p.label };
+  };
+
+  return (
+    <div className="w-full">
+      <div className="mb-2 flex flex-wrap items-center gap-x-3 gap-y-1 text-xs">
+        {PERCENTILE_LINES.map((line) => (
+          <div key={line.key} className="flex items-center gap-1.5">
+            <span className="inline-block w-3 h-0.5" style={{ backgroundColor: line.color }} />
+            <span className="text-muted-foreground">{line.label}</span>
+          </div>
+        ))}
+        <span className="ml-auto text-muted-foreground">
+          {points.length} configs · units: {unit}
+        </span>
+      </div>
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis ticks + gridlines */}
+        {yTicks.map((v, i) => {
+          const y = yOf(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left}
+                x2={PAD.left + innerW}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.08}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X-axis tick labels — one per sibling, rotated 30° to fit. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          return (
+            <g key={`x${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={PAD.top + innerH}
+                y2={PAD.top + innerH + 4}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={x}
+                y={PAD.top + innerH + 8}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor="end"
+                transform={`rotate(-30 ${x} ${PAD.top + innerH + 8})`}
+              >
+                {p.label}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X axis baseline */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.25}
+        />
+
+        {/* Percentile line segments + markers; a missing value leaves a gap in that line */}
+        {PERCENTILE_LINES.map((line) => {
+          const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
+          const markers: { x: number; y: number }[] = [];
+          let prev: { x: number; y: number } | null = null; // last plotted point of this line
+          for (let i = 0; i < points.length; i++) {
+            const v = points[i]!.values[line.key];
+            if (typeof v !== 'number' || !Number.isFinite(v)) {
+              prev = null; // no data here: do not connect across this x
+              continue;
+            }
+            const x = xOf(i);
+            const y = yOf(v);
+            markers.push({ x, y });
+            if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
+            prev = { x, y };
+          }
+          return (
+            <g key={line.key}>
+              {segments.map((s, j) => (
+                <line
+                  key={`s${j}`}
+                  x1={s.x1}
+                  y1={s.y1}
+                  x2={s.x2}
+                  y2={s.y2}
+                  stroke={line.color}
+                  strokeWidth={1.5}
+                />
+              ))}
+              {markers.map((m, j) => (
+                <circle key={`m${j}`} cx={m.x} cy={m.y} r={3} fill={line.color} />
+              ))}
+            </g>
+          );
+        })}
+      </ChartHover>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index 776c8ba2..aa727fdc 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -48,7 +48,7 @@ function frameworkLabel(fw: string) {
 }
 
 /** Short label for a sibling chip: parallelism + concurrency. */
-function chipLabel(s: BenchmarkSibling): string {
+export function chipLabel(s: BenchmarkSibling): string {
   const parallel = s.disagg
     ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D`
     : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`;
diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts
new file mode 100644
index 00000000..4ca25ee2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts
@@ -0,0 +1,45 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface MetricPercentiles { // same fields as MetricPercentiles in db/src/queries/agentic-aggregates.ts
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  n: number; // sample count behind the percentiles (per the db-side definition)
+}
+
+export interface AgenticAggregate { // one benchmark id's summary; a null metric is plotted as "no data"
+  id: number;
+  isl: MetricPercentiles | null; // input sequence length
+  osl: MetricPercentiles | null; // output sequence length
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>; // keyed by benchmark/sibling id
+
+async function fetchAgenticAggregates(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<AgenticAggregateMap> {
+  if (!ids.length) return {}; // nothing requested — resolve without hitting the API
+  const response = await fetch(`/api/v1/agentic-aggregates?ids=${ids.join(',')}`, { signal });
+  if (response.ok) return (await response.json()) as AgenticAggregateMap;
+  throw new Error(`agentic-aggregates ${response.status}`); // non-OK response: reject with status
+}
+
+/**
+ * React Query hook for per-id aggregate stats (mean/p50/p75/p90/p99) of ISL,
+ * OSL, KV cache utilization, and prefix cache hit rate. Ids are de-duplicated
+ * and sorted ascending first, so equivalent id sets share one cache entry.
+ */
+export function useAgenticAggregates(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: ['agentic-aggregates', uniqueIds.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchAgenticAggregates(uniqueIds, signal),
+    enabled: enabled && uniqueIds.length > 0, // never fire for an empty id set
+    staleTime: 5 * 60 * 1000, // 5 minutes
+  });
+}
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
new file mode 100644
index 00000000..2a0305bf
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, it } from 'vitest';
+
+import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates.js';
+
+describe('percentilesOf', () => {
+  it('returns null for empty input', () => {
+    expect(percentilesOf([])).toBeNull();
+    expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull(); // only non-finite samples
+  });
+
+  it('computes percentiles for a simple integer range', () => {
+    // 1..100, evenly spaced — linear quantile is straightforward.
+    const xs = Array.from({ length: 100 }, (_, i) => i + 1);
+    const p = percentilesOf(xs);
+    expect(p).not.toBeNull();
+    expect(p!.n).toBe(100);
+    expect(p!.mean).toBeCloseTo(50.5, 6);
+    expect(p!.p50).toBeCloseTo(50.5, 6);
+    // p75 sits at index 0.75 * 99 = 74.25, i.e. 75 + 0.25 * (76 - 75) = 75.25.
+    expect(p!.p75).toBeCloseTo(75.25, 6);
+    expect(p!.p90).toBeCloseTo(90.1, 6); // index 0.90 * 99 = 89.1
+    expect(p!.p99).toBeCloseTo(99.01, 6); // index 0.99 * 99 = 98.01
+  });
+
+  it('filters out non-finite values before computing', () => {
+    const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]);
+    expect(p?.n).toBe(4); // only 1, 2, 3, 4 are counted
+    expect(p?.mean).toBeCloseTo(2.5, 6);
+  });
+});
+
+describe('extractIslOsl', () => {
+  it('reads input/output sequence length from profiling records', () => {
+    const lines = [
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 200, unit: 'tokens' },
+          output_sequence_length: { value: 75, unit: 'tokens' },
+        },
+      }),
+      // warmup record — should be ignored
+      JSON.stringify({
+        metadata: { benchmark_phase: 'warmup' },
+        metrics: {
+          input_sequence_length: { value: 9999, unit: 'tokens' },
+          output_sequence_length: { value: 9999, unit: 'tokens' },
+        },
+      }),
+    ];
+    const { isl, osl } = extractIslOsl(lines.join('\n')); // input is newline-delimited JSON
+    expect(isl).toEqual([100, 200]); // record order preserved, warmup value absent
+    expect(osl).toEqual([50, 75]);
+  });
+});
+
+describe('extractServerMetricSamples', () => {
+  it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:kv_cache_usage_perc': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, end_ns: 1, avg: 0.1 },
+                { start_ns: 1, end_ns: 2, avg: 0.5 },
+                { start_ns: 2, end_ns: 3, avg: 0.9 },
+              ],
+            },
+          ],
+        },
+        'vllm:gpu_prefix_cache_hits': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 80 },
+                { start_ns: 1, rate: 50 },
+                { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0
+              ],
+            },
+          ],
+        },
+        'vllm:gpu_prefix_cache_queries': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 100 }, // hit rate = 0.8
+                { start_ns: 1, rate: 100 }, // hit rate = 0.5
+                { start_ns: 2, rate: 0 },
+              ],
+            },
+          ],
+        },
+      },
+    });
+    const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+    expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]); // the `avg` of each timeslice, in order
+    expect(prefixCacheHitRate).toEqual([0.8, 0.5]); // hits.rate / queries.rate; zero-query slice dropped
+  });
+
+  it('returns empty arrays when the JSON lacks the expected metric series', () => {
+    const out = extractServerMetricSamples(JSON.stringify({ metrics: {} }));
+    expect(out.kvCacheUtil).toEqual([]);
+    expect(out.prefixCacheHitRate).toEqual([]);
+  });
+});
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
new file mode 100644
index 00000000..49ae6900
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -0,0 +1,255 @@
+/**
+ * Per-id aggregate stats for the "Aggregates across configs" view on the
+ * agentic detail page. Each id contributes one summary number per metric per
+ * percentile so the frontend can plot how each metric varies across the
+ * SKU's parallelism + concurrency configs.
+ *
+ * Sources:
+ *  - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase)
+ *  - `server_metrics_json` → time-series of KV cache utilization +
+ *     prefix-cache hit rate per scrape interval
+ *
+ * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing
+ * or has no usable samples — frontend treats those as "no data".
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface MetricPercentiles {
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  /** Sample count used to compute the percentiles. */
+  n: number;
+}
+
+export interface AgenticAggregate {
+  id: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>;
+
+/**
+ * Each row pulls TWO compressed blobs (profile_export + server_metrics).
+ * `server_metrics_json_gz` can be up to ~17 MB compressed for high-conc
+ * runs, so even 3 rows can clear Neon's 64 MB cap. Stay conservative at 2.
+ * Chunks are issued in parallel below, so the wall-clock impact is small.
+ */
+const QUERY_CHUNK_SIZE = 2;
+
+/** Linear-interpolated percentile (matches numpy default). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/** Compute the percentile bundle for an array of samples; null if empty. */
+export function percentilesOf(samples: number[]): MetricPercentiles | null {
+  const clean = samples.filter((v) => Number.isFinite(v));
+  if (clean.length === 0) return null;
+  const sorted = [...clean].toSorted((a, b) => a - b);
+  return {
+    mean: meanOf(sorted),
+    p50: quantile(sorted, 0.5),
+    p75: quantile(sorted, 0.75),
+    p90: quantile(sorted, 0.9),
+    p99: quantile(sorted, 0.99),
+    n: sorted.length,
+  };
+}
+
+/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return v;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+interface ProfileRecord {
+  metadata?: { benchmark_phase?: string };
+  metrics?: {
+    input_sequence_length?: { value?: number } | number;
+    output_sequence_length?: { value?: number } | number;
+  };
+}
+
+/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */
+export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
+  const isl: number[] = [];
+  const osl: number[] = [];
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const m = rec.metrics ?? {};
+    const i = readNum(m.input_sequence_length);
+    const o = readNum(m.output_sequence_length);
+    if (typeof i === 'number') isl.push(i);
+    if (typeof o === 'number') osl.push(o);
+  }
+  return { isl, osl };
+}
+
+interface TimeSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+  count?: number;
+  sum?: number;
+}
+interface Series {
+  labels?: Record<string, string>;
+  timeslices?: TimeSlice[];
+}
+interface MetricMeta {
+  series?: Series[];
+}
+interface MetricsJson {
+  metrics?: Record<string, MetricMeta>;
+}
+
+/**
+ * Parse the server_metrics_json → time-series arrays for KV cache util and
+ * prefix cache hit rate (per-interval, computed from the prometheus
+ * counters the same way trace-server-metrics does it).
+ */
+export function extractServerMetricSamples(json: string): {
+  kvCacheUtil: number[];
+  prefixCacheHitRate: number[];
+} {
+  const parsed = JSON.parse(json) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+  const firstSeries = (name: string): Series | undefined => {
+    const s = metrics[name]?.series;
+    return s && s.length > 0 ? s[0] : undefined;
+  };
+
+  // KV cache util — gauge in [0, 1].
+  const kvSeries =
+    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
+  const kvCacheUtil: number[] = [];
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number') kvCacheUtil.push(ts.avg);
+  }
+
+  // Prefix cache hit rate per interval = hits.rate / queries.rate.
+  // Matches the derivation in queries/trace-server-metrics.ts.
+  const prefixCacheHitRate: number[] = [];
+  const hitsSeries = firstSeries('vllm:gpu_prefix_cache_hits');
+  const queriesSeries = firstSeries('vllm:gpu_prefix_cache_queries');
+  if (hitsSeries && queriesSeries) {
+    const qByStart = new Map<number, TimeSlice>();
+    for (const q of queriesSeries.timeslices ?? []) {
+      if (typeof q.start_ns === 'number') qByStart.set(q.start_ns, q);
+    }
+    for (const h of hitsSeries.timeslices ?? []) {
+      if (typeof h.start_ns !== 'number' || typeof h.rate !== 'number') continue;
+      const q = qByStart.get(h.start_ns);
+      if (!q || typeof q.rate !== 'number' || q.rate === 0) continue;
+      prefixCacheHitRate.push(h.rate / q.rate);
+    }
+  }
+
+  return { kvCacheUtil, prefixCacheHitRate };
+}
+
+export async function getAgenticAggregates(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<AgenticAggregateMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  // Serial chunks so we never have more than ~`QUERY_CHUNK_SIZE` blobs in
+  // memory at once. Some `server_metrics` blobs decompress to >100 MB; running
+  // all chunks in parallel OOMs the Node process. The aggregator is fronted by
+  // a blob cache (`blobOnly: true`), so the slow path runs at most once per
+  // sibling set.
+  const result: AgenticAggregateMap = {};
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob,
+        atr.server_metrics_json_gz as server_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as {
+      benchmark_result_id: number;
+      profile_blob: Buffer | null;
+      server_blob: Buffer | null;
+    }[];
+    for (const row of chunkRows) {
+      processRow(row, result);
+    }
+  }
+  return result;
+}
+
+function processRow(
+  row: { benchmark_result_id: number; profile_blob: Buffer | null; server_blob: Buffer | null },
+  result: AgenticAggregateMap,
+): void {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+
+  if (row.profile_blob) {
+    try {
+      const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+    } catch {
+      // ignore malformed blob
+    }
+  }
+  if (row.server_blob) {
+    try {
+      const json = gunzipSync(row.server_blob).toString('utf8');
+      const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+      kvPct = percentilesOf(kvCacheUtil);
+      prefixPct = percentilesOf(prefixCacheHitRate);
+    } catch {
+      // ignore malformed blob
+    }
+  }
+
+  result[Number(row.benchmark_result_id)] = {
+    id: Number(row.benchmark_result_id),
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+  };
+}

From 41ef33b21e6a34430be20e812e6eedbd7b8f90cf Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 16:13:49 -0500
Subject: [PATCH 36/55] fix(agentic-aggregates): metric name + stream-parse
 oversized blobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two issues left the Aggregates view mostly empty for the worst-case rows:

1. Prefix cache hit rate was null for EVERY row because the parser looked
   up `vllm:gpu_prefix_cache_*` but the actual metric names are
   `vllm:prefix_cache_*` (no `gpu_` prefix). Add fallback so both spellings
   work.

2. KV cache util + prefix cache hit rate were null for high-conc TP+EP
   rows. Their server_metrics_json decompresses past Node's max string
   length (0x1fffffe8 / 512 MB) because vllm dumps cache_config_info into
   every scrape interval, repeated thousands of times. `gunzipSync().toString()`
   threw ERR_STRING_TOO_LONG and the silent catch left both metrics null.

   Added stream-json fallback: pipe Buffer → gunzip → JSON parser →
   pick('metrics') → streamObject; only the metric keys we care about land
   in memory. Avoids ever materializing the 500+ MB JSON string. The
   fast path stays — sync gunzip + JSON.parse is tried first, and the
   stream parser only runs when that throws ERR_STRING_TOO_LONG.

Also split the DB fetch into two passes (profile blobs in batches of 8,
server blobs one at a time) so the server query response stays under
Neon's 64 MB HTTP cap on rows where the compressed server blob alone is
~17 MB and Neon's bytea-over-HTTP encoding inflates it ~1.6×.

Chart redesign: AggregateChart now draws a vertical bar per sibling
spanning the percentile range, with colored ticks at each percentile and
a diamond at the mean. Horizontal connecting lines per percentile remain
as a faint backdrop so the reader can still follow trends across configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/aggregate-chart.tsx         |  72 ++++++-
 packages/db/package.json                      |   5 +-
 .../db/src/queries/agentic-aggregates.test.ts |   4 +-
 packages/db/src/queries/agentic-aggregates.ts | 197 ++++++++++++------
 pnpm-lock.yaml                                |  36 ++++
 5 files changed, 242 insertions(+), 72 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
index 446677ad..55ac8061 100644
--- a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -188,10 +188,10 @@ export function AggregateChart({
           opacity={0.25}
         />
 
-        {/* Percentile polylines + markers */}
+        {/* Horizontal connecting lines per percentile — faint backdrop so the
+            eye can follow how each percentile changes across configs. */}
         {PERCENTILE_LINES.map((line) => {
           const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
-          const markers: { x: number; y: number }[] = [];
           let prev: { x: number; y: number } | null = null;
           for (let i = 0; i < points.length; i++) {
             const v = points[i]!.values[line.key];
@@ -201,12 +201,11 @@ export function AggregateChart({
             }
             const x = xOf(i);
             const y = yOf(v);
-            markers.push({ x, y });
             if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
             prev = { x, y };
           }
           return (
-            <g key={line.key}>
+            <g key={`hline-${line.key}`} opacity={0.35}>
               {segments.map((s, j) => (
                 <line
                   key={`s${j}`}
@@ -215,12 +214,69 @@ export function AggregateChart({
                   x2={s.x2}
                   y2={s.y2}
                   stroke={line.color}
-                  strokeWidth={1.5}
+                  strokeWidth={1}
                 />
               ))}
-              {markers.map((m, j) => (
-                <circle key={`m${j}`} cx={m.x} cy={m.y} r={3} fill={line.color} />
-              ))}
+            </g>
+          );
+        })}
+
+        {/* Per-sibling vertical bar spanning the percentile range, with a
+            colored tick at each percentile level. Mean rendered as a small
+            diamond to distinguish from the percentile ticks. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          // Collect percentile values present for this sibling.
+          const present = PERCENTILE_LINES.filter(
+            (line) =>
+              typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!),
+          ).map((line) => ({ ...line, value: p.values[line.key]! }));
+          if (present.length === 0) return null;
+          // Only the *percentile* values define the bar extent; mean might be
+          // outside the percentile span on weird distributions.
+          const pctlOnly = present.filter((p2) => p2.key !== 'mean');
+          const bandValues = pctlOnly.length > 0 ? pctlOnly : present;
+          const bandYs = bandValues.map((b) => yOf(b.value));
+          const yLo = Math.min(...bandYs);
+          const yHi = Math.max(...bandYs);
+          return (
+            <g key={`bar-${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={yLo}
+                y2={yHi}
+                stroke="currentColor"
+                strokeWidth={1}
+                opacity={0.35}
+              />
+              {present.map((b) => {
+                const ty = yOf(b.value);
+                if (b.key === 'mean') {
+                  // Diamond marker for mean.
+                  const s = 4;
+                  return (
+                    <polygon
+                      key={`m-${b.key}`}
+                      points={`${x},${ty - s} ${x + s},${ty} ${x},${ty + s} ${x - s},${ty}`}
+                      fill={b.color}
+                      stroke={b.color}
+                    />
+                  );
+                }
+                // Horizontal tick at each percentile.
+                return (
+                  <line
+                    key={`tk-${b.key}`}
+                    x1={x - 6}
+                    x2={x + 6}
+                    y1={ty}
+                    y2={ty}
+                    stroke={b.color}
+                    strokeWidth={2.5}
+                  />
+                );
+              })}
             </g>
           );
         })}
diff --git a/packages/db/package.json b/packages/db/package.json
index c849ea26..d7caf34d 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -30,11 +30,14 @@
     "@neondatabase/serverless": "^1.1.0",
     "@noble/ciphers": "^2.2.0",
     "@semianalysisai/inferencex-constants": "workspace:*",
-    "postgres": "^3.4.9"
+    "postgres": "^3.4.9",
+    "stream-chain": "^3.4.0",
+    "stream-json": "^2.1.0"
   },
   "devDependencies": {
     "@types/adm-zip": "^0.5.8",
     "@types/node": "^25.7.0",
+    "@types/stream-json": "^1.7.8",
     "@vitest/coverage-v8": "^4.1.6",
     "adm-zip": "^0.5.17",
     "dotenv-cli": "^11.0.0",
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
index 2a0305bf..8c712323 100644
--- a/packages/db/src/queries/agentic-aggregates.test.ts
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -76,7 +76,7 @@ describe('extractServerMetricSamples', () => {
             },
           ],
         },
-        'vllm:gpu_prefix_cache_hits': {
+        'vllm:prefix_cache_hits': {
           series: [
             {
               timeslices: [
@@ -87,7 +87,7 @@ describe('extractServerMetricSamples', () => {
             },
           ],
         },
-        'vllm:gpu_prefix_cache_queries': {
+        'vllm:prefix_cache_queries': {
           series: [
             {
               timeslices: [
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 49ae6900..22ec7b28 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -13,7 +13,14 @@
  * or has no usable samples — frontend treats those as "no data".
  */
 
-import { gunzipSync } from 'node:zlib';
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
 
@@ -38,12 +45,15 @@ export interface AgenticAggregate {
 export type AgenticAggregateMap = Record<number, AgenticAggregate>;
 
 /**
- * Each row pulls TWO compressed blobs (profile_export + server_metrics).
- * `server_metrics_json_gz` can be up to ~17 MB compressed for high-conc
- * runs, so even 3 rows can clear Neon's 64 MB cap. Stay conservative at 2.
- * Chunks are issued in parallel below, so the wall-clock impact is small.
+ * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per
+ * round-trip. `server_metrics_json_gz` is much bigger (~17 MB compressed
+ * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire
+ * size, so two of those = ~50 MB and three already trips the 64 MB cap).
+ * We fetch the two blob types in separate queries with different chunk
+ * sizes.
  */
-const QUERY_CHUNK_SIZE = 2;
+const PROFILE_CHUNK_SIZE = 8;
+const SERVER_CHUNK_SIZE = 1;
 
 /** Linear-interpolated percentile (matches numpy default). */
 function quantile(sortedAsc: number[], q: number): number {
@@ -162,9 +172,14 @@ export function extractServerMetricSamples(json: string): {
 
   // Prefix cache hit rate per interval = hits.rate / queries.rate.
   // Matches the derivation in queries/trace-server-metrics.ts.
+  // Metric names: vllm exposes these as `vllm:prefix_cache_*` (no `gpu_`
+  // prefix); falls back to the legacy `gpu_`-prefixed aliases in case a
+  // blob was recorded with those names instead.
   const prefixCacheHitRate: number[] = [];
-  const hitsSeries = firstSeries('vllm:gpu_prefix_cache_hits');
-  const queriesSeries = firstSeries('vllm:gpu_prefix_cache_queries');
+  const hitsSeries =
+    firstSeries('vllm:prefix_cache_hits') ?? firstSeries('vllm:gpu_prefix_cache_hits');
+  const queriesSeries =
+    firstSeries('vllm:prefix_cache_queries') ?? firstSeries('vllm:gpu_prefix_cache_queries');
   if (hitsSeries && queriesSeries) {
     const qByStart = new Map<number, TimeSlice>();
     for (const q of queriesSeries.timeslices ?? []) {
@@ -181,75 +196,135 @@ export function extractServerMetricSamples(json: string): {
   return { kvCacheUtil, prefixCacheHitRate };
 }
 
+/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc', // older fallback name
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics
+ * we need. Avoids Node's 512 MB string cap, which `gunzipSync().toString()`
+ * hits on server_metrics blobs from high-conc TP+EP runs (which can decompress
+ * to >500 MB because vllm dumps `cache_config_info` every scrape interval).
+ *
+ * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') →
+ * StreamObject (one metric per chunk) → keep only the keys we care about.
+ *
+ * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the
+ * synchronous fast path so callers can use either interchangeably.
+ */
+async function streamExtractServerMetricSamples(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  const collected: Record<string, MetricMeta> = {};
+  // stream-json composes transforms via stream-chain: `pick`/`streamObject`
+  // each return a Transform when called and `chain([...])` wires them.
+  // stream-json's TypeScript types expose the chain's event API only
+  // loosely, so the `.on(...)` registrations below cast the chain to
+  // `any`. It works at runtime.
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: MetricMeta };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
 export async function getAgenticAggregates(
   sql: DbClient,
   benchmarkResultIds: number[],
 ): Promise<AgenticAggregateMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  // Serial chunks so we never have more than ~`QUERY_CHUNK_SIZE` blobs in
-  // memory at once. Some `server_metrics` blobs decompress to >100 MB; running
-  // all chunks in parallel OOMs the Node process. The aggregator is fronted by
-  // a blob cache (`blobOnly: true`), so the slow path runs at most once per
-  // sibling set.
   const result: AgenticAggregateMap = {};
-  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
-    const chunkRows = (await sql`
+  // ── Pass 1: profile_export blobs (cheap; large batches). ────────────────
+  for (let i = 0; i < benchmarkResultIds.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + PROFILE_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; profile_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (row.profile_blob) {
+        try {
+          const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+          const { isl, osl } = extractIslOsl(jsonl);
+          result[id].isl = percentilesOf(isl);
+          result[id].osl = percentilesOf(osl);
+        } catch {
+          // ignore malformed blob
+        }
+      }
+    }
+  }
+  // ── Pass 2: server_metrics blobs (huge; one at a time). ────────────────
+  // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
+  // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
+  // path runs at most once per sibling set.
+  for (let i = 0; i < benchmarkResultIds.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + SERVER_CHUNK_SIZE);
+    const rows = (await sql`
       select
         br.id as benchmark_result_id,
-        atr.profile_export_jsonl_gz as profile_blob,
         atr.server_metrics_json_gz as server_blob
       from benchmark_results br
       join agentic_trace_replay atr on atr.id = br.trace_replay_id
       where br.id = any(${chunk}::bigint[])
-    `) as {
-      benchmark_result_id: number;
-      profile_blob: Buffer | null;
-      server_blob: Buffer | null;
-    }[];
-    for (const row of chunkRows) {
-      processRow(row, result);
+    `) as { benchmark_result_id: number; server_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (!row.server_blob) continue;
+      let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+      try {
+        const json = gunzipSync(row.server_blob).toString('utf8');
+        parsed = extractServerMetricSamples(json);
+      } catch (error) {
+        // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose
+        // server_metrics_json decompresses past Node's max string length.
+        // Stream-parse to extract just the metric subtrees we care about.
+        const code = error && (error as NodeJS.ErrnoException).code;
+        const msg = error instanceof Error ? error.message : String(error);
+        if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+          try {
+            parsed = await streamExtractServerMetricSamples(row.server_blob);
+          } catch {
+            // stream fallback failed too — leave nulls
+          }
+        }
+      }
+      if (parsed) {
+        result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil);
+        result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate);
+      }
     }
   }
   return result;
 }
 
-function processRow(
-  row: { benchmark_result_id: number; profile_blob: Buffer | null; server_blob: Buffer | null },
-  result: AgenticAggregateMap,
-): void {
-  let islPct: MetricPercentiles | null = null;
-  let oslPct: MetricPercentiles | null = null;
-  let kvPct: MetricPercentiles | null = null;
-  let prefixPct: MetricPercentiles | null = null;
-
-  if (row.profile_blob) {
-    try {
-      const jsonl = gunzipSync(row.profile_blob).toString('utf8');
-      const { isl, osl } = extractIslOsl(jsonl);
-      islPct = percentilesOf(isl);
-      oslPct = percentilesOf(osl);
-    } catch {
-      // ignore malformed blob
-    }
-  }
-  if (row.server_blob) {
-    try {
-      const json = gunzipSync(row.server_blob).toString('utf8');
-      const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
-      kvPct = percentilesOf(kvCacheUtil);
-      prefixPct = percentilesOf(prefixCacheHitRate);
-    } catch {
-      // ignore malformed blob
-    }
-  }
-
-  result[Number(row.benchmark_result_id)] = {
-    id: Number(row.benchmark_result_id),
-    isl: islPct,
-    osl: oslPct,
-    kvCacheUtil: kvPct,
-    prefixCacheHitRate: prefixPct,
-  };
+function blankAggregate(id: number): AgenticAggregate {
+  return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 14505e57..717ffc5c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -249,6 +249,12 @@ importers:
       postgres:
         specifier: ^3.4.9
         version: 3.4.9
+      stream-chain:
+        specifier: ^3.4.0
+        version: 3.6.3
+      stream-json:
+        specifier: ^2.1.0
+        version: 2.1.0
     devDependencies:
       '@types/adm-zip':
         specifier: ^0.5.8
@@ -256,6 +262,9 @@ importers:
       '@types/node':
         specifier: ^25.7.0
         version: 25.7.0
+      '@types/stream-json':
+        specifier: ^1.7.8
+        version: 1.7.8
       '@vitest/coverage-v8':
         specifier: ^4.1.6
         version: 4.1.6(vitest@4.1.6)
@@ -2334,6 +2343,12 @@ packages:
   '@types/stats.js@0.17.4':
     resolution: {integrity: sha512-jIBvWWShCvlBqBNIZt0KAshWpvSjhkwkEu4ZUcASoAvhmrgAUI2t1dXrjSL4xXVLB4FznPrIsX3nKXFl/Dt4vA==}
 
+  '@types/stream-chain@2.1.0':
+    resolution: {integrity: sha512-guDyAl6s/CAzXUOWpGK2bHvdiopLIwpGu8v10+lb9hnQOyo4oj/ZUQFOvqFjKGsE3wJP1fpIesCcMvbXuWsqOg==}
+
+  '@types/stream-json@1.7.8':
+    resolution: {integrity: sha512-MU1OB1eFLcYWd1LjwKXrxdoPtXSRzRmAnnxs4Js/ayB5O/NvHraWwuOaqMWIebpYwM6khFlsJOHEhI9xK/ab4Q==}
+
   '@types/three@0.184.1':
     resolution: {integrity: sha512-6q4VdiqVsrTRqmk62/BnlcAvIrnDM0zf2ZDVKI5kZiniWrSaOHaQzmbp+BNzoggc/8tgW412pL//wZIxu2PPTA==}
 
@@ -5074,9 +5089,15 @@ packages:
     resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==}
     engines: {node: '>= 0.4'}
 
+  stream-chain@3.6.3:
+    resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==}
+
   stream-combiner@0.2.2:
     resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==}
 
+  stream-json@2.1.0:
+    resolution: {integrity: sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==}
+
   string-width@4.2.3:
     resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
     engines: {node: '>=8'}
@@ -7392,6 +7413,15 @@ snapshots:
 
   '@types/stats.js@0.17.4': {}
 
+  '@types/stream-chain@2.1.0':
+    dependencies:
+      '@types/node': 25.7.0
+
+  '@types/stream-json@1.7.8':
+    dependencies:
+      '@types/node': 25.7.0
+      '@types/stream-chain': 2.1.0
+
   '@types/three@0.184.1':
     dependencies:
       '@dimforge/rapier3d-compat': 0.12.0
@@ -10752,11 +10782,17 @@ snapshots:
       es-errors: 1.3.0
       internal-slot: 1.1.0
 
+  stream-chain@3.6.3: {}
+
   stream-combiner@0.2.2:
     dependencies:
       duplexer: 0.1.2
       through: 2.3.8
 
+  stream-json@2.1.0:
+    dependencies:
+      stream-chain: 3.6.3
+
   string-width@4.2.3:
     dependencies:
       emoji-regex: 8.0.0

From 1cedd240e95b52789690919cc4b13600920d842f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:04:02 -0500
Subject: [PATCH 37/55] feat(agentic-aggregates): pre-compute stats at ingest
 time

Detail page was decompressing + parsing every trace_replay blob on each
request, sometimes hitting Node's 512 MB string cap on high-conc TP+EP
server_metrics_json. Pre-compute the percentile + derived bundles into
a versioned `aggregate_stats` JSONB column, mirroring the pattern Alec
suggested. APIs read the column first and only fall back to the slow
blob-parse path for rows the backfill hasn't drained.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../008_agentic_aggregate_stats.sql           |  18 +++
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-aggregate-stats.ts   | 150 ++++++++++++++++++
 .../src/etl/compute-aggregate-stats.test.ts   | 123 ++++++++++++++
 .../db/src/etl/compute-aggregate-stats.ts     | 147 +++++++++++++++++
 packages/db/src/etl/trace-replay-ingest.ts    |  17 +-
 packages/db/src/queries/agentic-aggregates.ts |  77 ++++++++-
 .../db/src/queries/derived-agentic-metrics.ts |  47 +++++-
 8 files changed, 569 insertions(+), 11 deletions(-)
 create mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql
 create mode 100644 packages/db/src/backfill-aggregate-stats.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.test.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.ts

diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql
new file mode 100644
index 00000000..d55533b9
--- /dev/null
+++ b/packages/db/migrations/008_agentic_aggregate_stats.sql
@@ -0,0 +1,18 @@
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+  add column aggregate_stats jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index d7caf34d..f3f92311 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,6 +19,7 @@
     "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+    "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
new file mode 100644
index 00000000..8dd42dce
--- /dev/null
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -0,0 +1,150 @@
+/**
+ * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it
+ * or were computed by an older `STATS_VERSION`.
+ *
+ * The ingest path now computes stats inline, but existing rows (and rows
+ * whose computation logic has since changed) still need this pass. Run after
+ * applying migration 008 and any time `STATS_VERSION` bumps.
+ *
+ * Strategy:
+ *   - Stream rows one at a time (server_metrics_json_gz can be hundreds of
+ *     MB decompressed for TP+EP / high-conc points — keeping one in memory
+ *     at a time avoids OOM).
+ *   - Skip rows whose stored `aggregate_stats.version` already matches.
+ *   - Recompute via the same `computeAggregateStats()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats
+ *     [--limit N]   only process the first N candidate rows (useful for
+ *                   smoke-tests on a fresh deploy)
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      const next = process.argv[++i];
+      if (!next || Number.isNaN(Number(next))) {
+        console.error('--limit requires a numeric argument');
+        process.exit(1);
+      }
+      limit = Number(next);
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-aggregate-stats ===');
+  console.log(`  STATS_VERSION = ${STATS_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Find candidates: rows missing stats, or whose stored version is stale.
+  // (aggregate_stats->>'version')::int is NULL when the key is absent, and
+  // `NULL <> n` is never true, so coalesce to -1 to treat such rows as stale.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where aggregate_stats is null
+           or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION}
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      // Fetch one row at a time — the json_gz blob is the heavy field.
+      const [row] = await sql<
+        { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[]
+      >`
+        select profile_export_jsonl_gz, server_metrics_json_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+
+      const stats = await computeAggregateStats({
+        profileBlob: row.profile_export_jsonl_gz,
+        serverBlob: row.server_metrics_json_gz,
+      });
+
+      await sql`
+        update agentic_trace_replay
+        set aggregate_stats = ${sql.json(structuredClone(stats) as unknown as Parameters<typeof sql.json>[0])}
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-aggregate-stats failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
new file mode 100644
index 00000000..de0009de
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -0,0 +1,123 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js';
+
+/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
+function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
+  const lines = requests.map((r, i) =>
+    JSON.stringify({
+      metadata: {
+        benchmark_phase: 'profiling',
+        conversation_id: `conv-${i}`,
+        turn_index: 0,
+      },
+      metrics: {
+        input_sequence_length: { value: r.isl, unit: 'tokens' },
+        output_sequence_length: { value: r.osl, unit: 'tokens' },
+        request_latency: { value: r.rl ?? 1000, unit: 'ms' },
+        time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' },
+      },
+    }),
+  );
+  return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */
+function makeServerBlob() {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1, avg: 0.2 },
+              { start_ns: 1, end_ns: 2, avg: 0.5 },
+              { start_ns: 2, end_ns: 3, avg: 0.8 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeAggregateStats', () => {
+  it('returns the current STATS_VERSION in the bundle', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+
+  it('leaves every metric null when both blobs are null', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+  });
+
+  it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
+    const profileBlob = makeProfileBlob([
+      { isl: 100, osl: 50, rl: 1000, ttft: 100 },
+      { isl: 200, osl: 75, rl: 2000, ttft: 200 },
+      { isl: 300, osl: 100, rl: 3000, ttft: 300 },
+    ]);
+    const stats = await computeAggregateStats({ profileBlob, serverBlob: null });
+
+    expect(stats.isl?.n).toBe(3);
+    expect(stats.isl?.mean).toBeCloseTo(200, 6);
+    expect(stats.osl?.n).toBe(3);
+    expect(stats.osl?.mean).toBeCloseTo(75, 6);
+
+    // Server-side metrics still null when there's no server blob.
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+
+    // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000.
+    expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6);
+    // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+    //   loads = [150, 275, 400], mean_load = 275
+    //   scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
+    //   mean ≈ 1.9653
+    expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+  });
+
+  it('computes KV util + prefix hit rate from the server blob alone', async () => {
+    const stats = await computeAggregateStats({
+      profileBlob: null,
+      serverBlob: makeServerBlob(),
+    });
+    expect(stats.kvCacheUtil?.n).toBe(3);
+    expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6);
+    expect(stats.prefixCacheHitRate?.n).toBe(1);
+    expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6);
+
+    // Profile-derived metrics absent.
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+  });
+
+  it('tolerates a malformed profile blob by leaving its metrics null', async () => {
+    // A random non-gzip buffer triggers a gunzip error — code path swallows it.
+    const garbage = Buffer.from('not-gzip-data');
+    const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+    // Version still set so the row is considered "computed".
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
new file mode 100644
index 00000000..a422cfec
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -0,0 +1,147 @@
+/**
+ * Pre-compute the per-row aggregate stats for an `agentic_trace_replay`
+ * blob pair. The output lands in the `aggregate_stats` JSONB column so the
+ * detail page can serve the "Aggregates across configs" view and the
+ * derived chart x-axis modes from a single SQL row read, instead of
+ * parsing the raw blobs on demand.
+ *
+ * Shape is intentionally versioned — bump `STATS_VERSION` whenever the
+ * computation changes so the backfill script knows which rows to recompute.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics.js';
+import {
+  STATS_VERSION,
+  extractIslOsl,
+  extractServerMetricSamples,
+  percentilesOf,
+  type MetricPercentiles,
+} from '../queries/agentic-aggregates.js';
+
+export { STATS_VERSION };
+
+export interface AggregateStats {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */
+  normalizedSessionTimeS: number | null;
+  /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
+  p90PrefillTpsPerUser: number | null;
+}
+
+/** Metric subtrees we extract via stream-parse on oversized server blobs. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits',
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect just the metric
+ * subtrees we care about. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows.
+ */
+async function streamExtractServer(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: Record<string, unknown> = {};
+  const pipelineStream = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipelineStream as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: unknown };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipelineStream as any).on('end', resolve);
+    (pipelineStream as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+/**
+ * Compute the full versioned stats bundle from a (profile, server-metrics)
+ * blob pair. Either blob may be null (e.g. only the server file existed) —
+ * the corresponding stats just come back null.
+ */
+export async function computeAggregateStats(args: {
+  profileBlob: Buffer | null;
+  serverBlob: Buffer | null;
+}): Promise<AggregateStats> {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let normalized: number | null = null;
+  let prefillP90: number | null = null;
+
+  if (args.profileBlob) {
+    try {
+      const jsonl = gunzipSync(args.profileBlob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+      const derived = computeDerivedFromBlob(jsonl);
+      normalized = derived.normalized_session_time_s;
+      prefillP90 = derived.p90_prefill_tps_per_user;
+    } catch {
+      // ignore malformed blob — leave nulls
+    }
+  }
+
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+  if (args.serverBlob) {
+    let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+    try {
+      const json = gunzipSync(args.serverBlob).toString('utf8');
+      server = extractServerMetricSamples(json);
+    } catch (error) {
+      const code = error && (error as NodeJS.ErrnoException).code;
+      const msg = error instanceof Error ? error.message : String(error);
+      // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to
+      // pull just the metric subtrees we need without materializing the
+      // full 500+ MB JSON string.
+      if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+        try {
+          server = await streamExtractServer(args.serverBlob);
+        } catch {
+          // stream fallback failed too — leave nulls
+        }
+      }
+    }
+    if (server) {
+      kvPct = percentilesOf(server.kvCacheUtil);
+      prefixPct = percentilesOf(server.prefixCacheHitRate);
+    }
+  }
+
+  return {
+    version: STATS_VERSION,
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+    normalizedSessionTimeS: normalized,
+    p90PrefillTpsPerUser: prefillP90,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8c6d92b6..423f70e7 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -12,6 +12,8 @@ import { gzipSync } from 'node:zlib';
 
 import type postgres from 'postgres';
 
+import { computeAggregateStats } from './compute-aggregate-stats.js';
+
 type Sql = ReturnType<typeof postgres>;
 
 /**
@@ -55,6 +57,15 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
+  // Pre-compute the aggregate stats so the detail page / aggregates view
+  // doesn't have to re-parse these blobs on every request. The compute
+  // function tolerates one-or-both blobs being null and falls back to a
+  // streaming parser for oversized server_metrics blobs.
+  const aggregateStats = await computeAggregateStats({
+    profileBlob: profileGz,
+    serverBlob: metricsJsonGz,
+  });
+
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
       profile_export_jsonl_gz,
@@ -62,7 +73,8 @@ export async function insertTraceReplay(
       server_metrics_csv,
       server_metrics_csv_size,
       server_metrics_json_gz,
-      server_metrics_json_uncompressed_size
+      server_metrics_json_uncompressed_size,
+      aggregate_stats
     )
     values (
       ${profileGz},
@@ -70,7 +82,8 @@ export async function insertTraceReplay(
       ${serverMetricsCsv},
       ${csvSize},
       ${metricsJsonGz},
-      ${metricsJsonSize}
+      ${metricsJsonSize},
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 22ec7b28..8ac4f678 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -24,6 +24,14 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
 
+/**
+ * Bump when the aggregate-stats computation algorithm changes — the backfill
+ * script recomputes any row whose stored `aggregate_stats.version` differs.
+ * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
+ * import: the compute helper depends on the percentile utilities below.
+ */
+export const STATS_VERSION = 1;
+
 export interface MetricPercentiles {
   mean: number;
   p50: number;
@@ -254,9 +262,55 @@ export async function getAgenticAggregates(
   if (benchmarkResultIds.length === 0) return {};
 
   const result: AgenticAggregateMap = {};
-  // ── Pass 1: profile_export blobs (cheap; large batches). ────────────────
-  for (let i = 0; i < benchmarkResultIds.length; i += PROFILE_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + PROFILE_CHUNK_SIZE);
+
+  // Fast path: read the pre-computed `aggregate_stats` JSONB written by the
+  // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One
+  // round-trip pulls everything we need for every requested id with no blob
+  // decompression, so the slow blob-parsing fallback only runs for ids
+  // whose stats are missing or were produced by an older `STATS_VERSION`.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: AggregateStatsRow | null;
+  }[];
+
+  const idsNeedingProfile: number[] = [];
+  const idsNeedingServer: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    const agg = blankAggregate(id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      agg.isl = row.stats.isl ?? null;
+      agg.osl = row.stats.osl ?? null;
+      agg.kvCacheUtil = row.stats.kvCacheUtil ?? null;
+      agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null;
+    } else {
+      // No stats (or stale version) — schedule the blob-parse fallback below
+      // so the response still surfaces data. Backfill should drain these.
+      idsNeedingProfile.push(id);
+      idsNeedingServer.push(id);
+    }
+    result[id] = agg;
+  }
+  // Ids that returned no row at all (no trace_replay link) get a blank
+  // aggregate, keeping the caller contract: every requested id is in the map.
+  for (const id of benchmarkResultIds) {
+    if (!(id in result)) result[id] = blankAggregate(id);
+  }
+
+  if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) {
+    return result;
+  }
+
+  // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ──────
+  for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -280,12 +334,12 @@ export async function getAgenticAggregates(
       }
     }
   }
-  // ── Pass 2: server_metrics blobs (huge; one at a time). ────────────────
+  // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ───────
   // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
   // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
   // path runs at most once per sibling set.
-  for (let i = 0; i < benchmarkResultIds.length; i += SERVER_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + SERVER_CHUNK_SIZE);
+  for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -325,6 +379,17 @@ export async function getAgenticAggregates(
   return result;
 }
 
+/** Shape of the JSONB column when read back via postgres-js. */
+interface AggregateStatsRow {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+}
+
 function blankAggregate(id: number): AgenticAggregate {
   return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index ac6fd38d..a14a1727 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -21,6 +21,7 @@
 import { gunzipSync } from 'node:zlib';
 
 import type { DbClient } from '../connection.js';
+import { STATS_VERSION } from './agentic-aggregates.js';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */
@@ -190,9 +191,50 @@ export async function getDerivedAgenticMetrics(
 ): Promise<DerivedAgenticMetricMap> {
   if (benchmarkResultIds.length === 0) return {};
 
+  const result: DerivedAgenticMetricMap = {};
+
+  // Fast path: read the pre-computed values out of `aggregate_stats`. The
+  // ingest pipeline computes both metrics in the same pass that produces the
+  // percentile bundles, so a single SQL round-trip covers most ids without
+  // touching the gzipped profile blob.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: {
+      version?: number;
+      normalizedSessionTimeS?: number | null;
+      p90PrefillTpsPerUser?: number | null;
+    } | null;
+  }[];
+
+  const idsNeedingBlob: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      result[id] = {
+        id,
+        normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null,
+        p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null,
+      };
+    } else {
+      idsNeedingBlob.push(id);
+    }
+  }
+
+  if (idsNeedingBlob.length === 0) return result;
+
+  // Fallback: parse the profile blob directly. Used for rows whose
+  // `aggregate_stats` is null or computed by an older STATS_VERSION; the
+  // backfill script drains the population so this path should be rare.
   const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
-  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+  for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE);
     const chunkRows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -205,7 +247,6 @@ export async function getDerivedAgenticMetrics(
     rows.push(...chunkRows);
   }
 
-  const result: DerivedAgenticMetricMap = {};
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');

From 9d9c7c13413c16a147b176691782827d5ee8d21d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:07:30 -0500
Subject: [PATCH 38/55] fix(agentic-aggregates): drop .js extension on
 app-route-traced import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turbopack doesn't do TypeScript's `.js → .ts` substitution when an
app-route bundles an intra-package value import, so the new
`STATS_VERSION` import broke the /api/v1/derived-agentic-metrics
route. The same `.js` value-import pattern works for files not pulled
into an app route (e.g. workflow-run.ts → run-overrides.ts) so the
existing intra-package imports are left alone.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/queries/derived-agentic-metrics.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index a14a1727..35a4b76c 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -21,7 +21,7 @@
 import { gunzipSync } from 'node:zlib';
 
 import type { DbClient } from '../connection.js';
-import { STATS_VERSION } from './agentic-aggregates.js';
+import { STATS_VERSION } from './agentic-aggregates';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */

From 6063d01e2d563951d70dea699edd30a6b06df81a Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:18:31 -0500
Subject: [PATCH 39/55] feat(agentic-detail): pre-compute chart_series at
 ingest time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detail page was parsing the entire server_metrics_json_gz blob on every
request — fine for small rows, but TP+EP high-conc rows decompress past
Node's 512 MB max-string-length cap and threw ERR_STRING_TOO_LONG,
killing the page for point 206242 et al.

Extends the Alec-pattern to the time-series path: new `chart_series`
JSONB column holds pre-extracted kvCacheUsage, prefixCacheHitRate,
queueDepth, prefillTps, decodeTps, and promptTokensBySource arrays.
The API fast-path is a single SQL row read; the slow path (compute
from blob, with stream-parse fallback for oversized rows) only runs
for rows whose chart_series is missing or stale-versioned.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../migrations/009_agentic_chart_series.sql   |  19 ++
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-chart-series.ts      | 154 ++++++++++
 .../db/src/etl/compute-chart-series.test.ts   | 129 +++++++++
 packages/db/src/etl/compute-chart-series.ts   | 268 ++++++++++++++++++
 packages/db/src/etl/trace-replay-ingest.ts    |  21 +-
 .../db/src/queries/trace-server-metrics.ts    | 261 +++++------------
 7 files changed, 654 insertions(+), 199 deletions(-)
 create mode 100644 packages/db/migrations/009_agentic_chart_series.sql
 create mode 100644 packages/db/src/backfill-chart-series.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.test.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.ts

diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql
new file mode 100644
index 00000000..b42718b9
--- /dev/null
+++ b/packages/db/migrations/009_agentic_chart_series.sql
@@ -0,0 +1,19 @@
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (migration 008): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+  add column chart_series jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index f3f92311..f97c442a 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -20,6 +20,7 @@
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
+    "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
new file mode 100644
index 00000000..66156b45
--- /dev/null
+++ b/packages/db/src/backfill-chart-series.ts
@@ -0,0 +1,154 @@
+/**
+ * Backfill `agentic_trace_replay.chart_series` for rows that are missing it
+ * or were computed by an older `CHART_SERIES_VERSION`.
+ *
+ * The ingest path now computes the time-series inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION`
+ * bumps.
+ *
+ * Strategy:
+ *   - Stream rows one at a time (server_metrics_json_gz can decompress
+ *     past 500 MB on high-conc TP+EP points — one in memory at a time
+ *     avoids OOM).
+ *   - Skip rows whose stored version already matches.
+ *   - Recompute via the same `computeChartSeries()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+/** Parse `--force` / `--limit N` from argv; exits with code 1 unless N is a positive integer. */
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      limit = Number(process.argv[++i] ?? Number.NaN);
+      if (!Number.isInteger(limit) || limit < 1) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-chart-series ===');
+  console.log(`  CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows that actually have a server_metrics blob can produce a
+  // chart_series. Rows without the blob legitimately keep `chart_series`
+  // null; the API returns null for them as well (no pre-computed series
+  // and no blob to parse), so the page falls into the
+  // "no stored trace_replay blob" branch.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+          and (
+            chart_series is null
+            or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+        select server_metrics_json_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+
+      const series = await computeChartSeries(row.server_metrics_json_gz);
+
+      await sql`
+        update agentic_trace_replay
+        set chart_series = ${
+          series === null
+            ? null
+            : sql.json(structuredClone(series) as unknown as Parameters<typeof sql.json>[0])
+        }
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-chart-series failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
new file mode 100644
index 00000000..dafc7200
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -0,0 +1,129 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js';
+
+/**
+ * Build a minimal server_metrics_json blob covering the metrics the chart
+ * consumes. Each timeslice is one second long starting at t=0.
+ */
+function makeBlob(opts?: {
+  prefixHits?: number;
+  prefixQueries?: number;
+  promptTokensRate?: number;
+}) {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1e9, avg: 0.1 },
+              { start_ns: 1e9, end_ns: 2e9, avg: 0.4 },
+              { start_ns: 2e9, end_ns: 3e9, avg: 0.7 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }],
+      },
+      'vllm:num_requests_running': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }],
+      },
+      'vllm:num_requests_waiting': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }],
+      },
+      'vllm:prompt_tokens': {
+        series: [
+          { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] },
+        ],
+      },
+      'vllm:generation_tokens': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }],
+      },
+      'vllm:prompt_tokens_by_source': {
+        series: [
+          {
+            labels: { source: 'local_cache_hit' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }],
+          },
+          {
+            labels: { source: 'miss' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }],
+          },
+        ],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeChartSeries', () => {
+  it('returns null when the blob is null', async () => {
+    expect(await computeChartSeries(null)).toBeNull();
+  });
+
+  it('returns the current CHART_SERIES_VERSION in the bundle', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.version).toBe(CHART_SERIES_VERSION);
+  });
+
+  it('extracts kvCacheUsage points with t=seconds-from-start', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.kvCacheUsage).toEqual([
+      { t: 0, value: 0.1 },
+      { t: 1, value: 0.4 },
+      { t: 2, value: 0.7 },
+    ]);
+  });
+
+  it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
+    expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
+  });
+
+  it('drops prefixCacheHitRate windows where queries.rate is 0', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 }));
+    expect(series?.prefixCacheHitRate).toEqual([]);
+  });
+
+  it('pairs running + waiting into queueDepth points', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]);
+  });
+
+  it('extracts prefillTps + decodeTps from counter rates', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]);
+    expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]);
+  });
+
+  it('splits promptTokensBySource by label and skips empty series', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([
+      'local_cache_hit',
+      'miss',
+    ]);
+    expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]);
+    expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]);
+  });
+
+  it('computes timing metadata from the widest metric window', async () => {
+    const series = await computeChartSeries(makeBlob());
+    // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9.
+    expect(series?.startNs).toBe(0);
+    expect(series?.endNs).toBe(3e9);
+    expect(series?.durationS).toBeCloseTo(3, 6);
+    expect(series?.timeslicesCount).toBe(3);
+  });
+
+  it('returns null on a malformed (non-gzip) blob', async () => {
+    const result = await computeChartSeries(Buffer.from('not-gzip-data'));
+    expect(result).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
new file mode 100644
index 00000000..3cb4181b
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -0,0 +1,268 @@
+/**
+ * Pre-compute the time-series for the agentic detail page chart, so the
+ * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every
+ * request. The output lands in `agentic_trace_replay.chart_series` and is
+ * read directly by `getTraceServerMetrics`.
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `CHART_SERIES_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const CHART_SERIES_VERSION = 1;
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+
+export interface ChartSeries {
+  version: number;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+// ── Raw blob shapes (subset we read) ────────────────────────────────────
+
+interface RawSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+}
+
+interface RawSeries {
+  labels?: Record<string, string>;
+  timeslices?: RawSlice[];
+}
+
+interface RawMetric {
+  series?: RawSeries[];
+}
+
+type MetricsMap = Record<string, RawMetric>;
+
+/** The set of metric subtrees the chart consumes. */
+const CHART_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:num_requests_running',
+  'vllm:num_requests_waiting',
+  'vllm:prompt_tokens',
+  'vllm:generation_tokens',
+  'vllm:prompt_tokens_by_source',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect only the metric
+ * subtrees the chart needs. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows.
+ */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: MetricsMap = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: RawMetric };
+      if (CHART_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return collected;
+}
+
+/**
+ * Parse the gzipped blob: sync path first, stream-parse on ERR_STRING_TOO_LONG
+ * (high-conc TP+EP rows). NOTE(review): sync returns every metric in the blob,
+ * the fallback only CHART_METRIC_KEYS, so all-key timing scans can differ.
+ */
+async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
+  try {
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap };
+    return obj.metrics ?? {};
+  } catch (error) {
+    const code = error && (error as NodeJS.ErrnoException).code;
+    const msg = error instanceof Error ? error.message : String(error);
+    if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+      return await streamCollectMetrics(buffer);
+    }
+    throw error;
+  }
+}
+
+/**
+ * Build chart-ready time-series arrays from a gzipped server_metrics blob.
+ * The math mirrors `getTraceServerMetrics` — this helper exists so ingest,
+ * backfill, and the API path produce byte-identical results.
+ */
+export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeries | null> {
+  if (!blob) return null;
+  let metrics: MetricsMap;
+  try {
+    metrics = await parseMetrics(blob);
+  } catch {
+    // Malformed blob → no series (caller treats null as "no data").
+    return null;
+  }
+  return buildSeriesFromMetrics(metrics);
+}
+
+/** Return the first series recorded under `name`, or undefined when there is none. */
+function firstSeries(metrics: MetricsMap, name: string): RawSeries | undefined {
+  // Optional chaining covers both a missing metric and an empty `series` array.
+  return metrics[name]?.series?.[0];
+}
+
+function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
+  // Timing reference: smallest start_ns and largest end_ns across every
+  // timeslice we extracted. (Same logic as the original getTraceServerMetrics
+  // — looking at every metric gives the widest possible window even if some
+  // series start late.)
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1)
+  const kvCacheUsage: TimeSeriesPoint[] = [];
+  const kvSeries =
+    firstSeries(metrics, 'vllm:kv_cache_usage_perc') ??
+    firstSeries(metrics, 'vllm:gpu_cache_usage_perc');
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number' && typeof ts.start_ns === 'number') {
+      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
+    }
+  }
+
+  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
+  const hitsTs = firstSeries(metrics, 'vllm:prefix_cache_hits')?.timeslices ?? [];
+  const qsTs = firstSeries(metrics, 'vllm:prefix_cache_queries')?.timeslices ?? [];
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  const minLen = Math.min(hitsTs.length, qsTs.length);
+  for (let i = 0; i < minLen; i++) {
+    const h = hitsTs[i]!;
+    const q = qsTs[i]!;
+    if (
+      typeof q.rate === 'number' &&
+      q.rate > 0 &&
+      typeof h.rate === 'number' &&
+      typeof h.start_ns === 'number'
+    ) {
+      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
+    }
+  }
+
+  // Queue depth: pair running + waiting by index.
+  const runTs = firstSeries(metrics, 'vllm:num_requests_running')?.timeslices ?? [];
+  const waitTs = firstSeries(metrics, 'vllm:num_requests_waiting')?.timeslices ?? [];
+  const queueDepth: QueueDepthPoint[] = [];
+  const qlen = Math.min(runTs.length, waitTs.length);
+  for (let i = 0; i < qlen; i++) {
+    const r = runTs[i]!;
+    const w = waitTs[i]!;
+    if (typeof r.start_ns !== 'number') continue;
+    const running = typeof r.avg === 'number' ? r.avg : 0;
+    const waiting = typeof w.avg === 'number' ? w.avg : 0;
+    queueDepth.push({
+      t: tOf(r.start_ns),
+      running,
+      waiting,
+      total: running + waiting,
+    });
+  }
+
+  // Throughput: extract counter `rate` (already per-second from aiperf).
+  const counterRate = (name: string): TimeSeriesPoint[] => {
+    const s = firstSeries(metrics, name);
+    if (!s) return [];
+    const out: TimeSeriesPoint[] = [];
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        out.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    return out;
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens');
+
+  // Per-source prompt tokens — emit one TS array per source label.
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    const arr: TimeSeriesPoint[] = [];
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 423f70e7..f70200ff 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -13,6 +13,7 @@ import { gzipSync } from 'node:zlib';
 import type postgres from 'postgres';
 
 import { computeAggregateStats } from './compute-aggregate-stats.js';
+import { computeChartSeries } from './compute-chart-series.js';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -57,14 +58,14 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
-  // Pre-compute the aggregate stats so the detail page / aggregates view
-  // doesn't have to re-parse these blobs on every request. The compute
-  // function tolerates one-or-both blobs being null and falls back to a
+  // Pre-compute the aggregate stats + chart-ready time-series so the
+  // detail page / aggregates view doesn't have to re-parse these blobs on
+  // every request. Both helpers tolerate a null blob and fall back to a
   // streaming parser for oversized server_metrics blobs.
-  const aggregateStats = await computeAggregateStats({
-    profileBlob: profileGz,
-    serverBlob: metricsJsonGz,
-  });
+  const [aggregateStats, chartSeries] = await Promise.all([
+    computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
+    computeChartSeries(metricsJsonGz),
+  ]);
 
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
@@ -74,7 +75,8 @@ export async function insertTraceReplay(
       server_metrics_csv_size,
       server_metrics_json_gz,
       server_metrics_json_uncompressed_size,
-      aggregate_stats
+      aggregate_stats,
+      chart_series
     )
     values (
       ${profileGz},
@@ -83,7 +85,8 @@ export async function insertTraceReplay(
       ${csvSize},
       ${metricsJsonGz},
       ${metricsJsonSize},
-      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])}
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 822ae633..624b6ed3 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -1,73 +1,26 @@
 /**
- * Parse aiperf's `server_metrics_export.json` blob (gzipped in
- * `agentic_trace_replay.server_metrics_json_gz`) and return a slim, chart-ready
- * time-series for one benchmark point.
+ * Time-series view of one agentic benchmark point: chart-ready arrays for
+ * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS,
+ * and per-source prompt-token counts.
  *
- * The raw JSON has shape:
- *   metrics: {
- *     "<metric_name>": {
- *       series: [
- *         {
- *           labels: { ... },
- *           stats: { ... summary ... },
- *           timeslices: [
- *             { start_ns, end_ns, avg, min, max }            // gauges
- *             { start_ns, end_ns, total, rate }              // counters
- *           ]
- *         }
- *       ]
- *     }
- *   }
- *
- * Timeslices are ~1 Hz windows. The benchmark window can be tens of minutes
- * (1800+ windows). We return them as `[{ t, ...}]` arrays with `t` measured
- * in seconds from the benchmark start so the frontend doesn't need to
- * shuffle bigint nanoseconds around.
+ * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest
+ * time, see `etl/compute-chart-series.ts`). The fast path is a single SQL
+ * row read; the slow path re-computes from `server_metrics_json_gz` and is
+ * only taken when the column is missing or the stored
+ * `CHART_SERIES_VERSION` is stale (the backfill script should drain that).
  */
 
-import { gunzipSync } from 'node:zlib';
+import {
+  CHART_SERIES_VERSION,
+  computeChartSeries,
+  type ChartSeries,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '../etl/compute-chart-series';
 
 import type { DbClient } from '../connection.js';
 
-interface GaugeSlice {
-  start_ns: number;
-  end_ns: number;
-  avg?: number;
-  min?: number;
-  max?: number;
-}
-
-interface CounterSlice {
-  start_ns: number;
-  end_ns: number;
-  total?: number;
-  rate?: number;
-}
-
-interface Series {
-  endpoint_url?: string;
-  labels?: Record<string, string>;
-  stats?: Record<string, unknown>;
-  timeslices?: (GaugeSlice & CounterSlice)[];
-}
-
-interface MetricsJson {
-  metrics?: Record<string, { type?: string; description?: string; series?: Series[] }>;
-}
-
-export interface TimeSeriesPoint {
-  /** Seconds from benchmark start. */
-  t: number;
-  value: number;
-}
-
-export interface QueueDepthPoint {
-  t: number;
-  running: number;
-  waiting: number;
-  /** Optional total — frontend can compute too. */
-  total: number;
-}
+export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series';
 
 export interface PointMeta {
   id: number;
@@ -120,30 +73,13 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
 }
 
-export async function getTraceServerMetrics(
-  sql: DbClient,
-  benchmarkResultId: number,
-): Promise<TraceServerMetrics | null> {
-  const rows = (await sql`
-    select
-      atr.server_metrics_json_gz as blob,
-      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
-      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
-      br.date::text,
-      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
-      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
-      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
-    from benchmark_results br
-    join configs c on c.id = br.config_id
-    join workflow_runs wr on wr.id = br.workflow_run_id
-    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
-    where br.id = ${benchmarkResultId}
-  `) as unknown as ({ blob: Buffer | null } & PointMeta)[];
-  const row = rows[0];
-  if (!row) return null;
-  const blob = row.blob;
-  if (!blob) return null;
-  const pointMeta: PointMeta = {
+interface RawMetaRow extends PointMeta {
+  blob: Buffer | null;
+  chart_series: ChartSeries | null;
+}
+
+function buildMeta(row: RawMetaRow): PointMeta {
+  return {
     id: Number(row.id),
     hardware: row.hardware,
     framework: row.framework,
@@ -163,113 +99,58 @@ export async function getTraceServerMetrics(
     server_cpu_cache_hit_rate:
       row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
   };
+}
 
-  const parsed = JSON.parse(gunzipSync(blob).toString('utf8')) as MetricsJson;
-  const metrics = parsed.metrics ?? {};
-
-  const firstSeries = (name: string): Series | undefined => {
-    const s = metrics[name]?.series;
-    return s && s.length > 0 ? s[0] : undefined;
+function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
+  return {
+    meta,
+    startNs: series.startNs,
+    endNs: series.endNs,
+    durationS: series.durationS,
+    timeslicesCount: series.timeslicesCount,
+    kvCacheUsage: series.kvCacheUsage,
+    prefixCacheHitRate: series.prefixCacheHitRate,
+    queueDepth: series.queueDepth,
+    promptTokensBySource: series.promptTokensBySource,
+    prefillTps: series.prefillTps,
+    decodeTps: series.decodeTps,
   };
+}
 
-  // Compute timing reference from the first gauge metric we can find.
-  let startNs = Number.POSITIVE_INFINITY;
-  let endNs = 0;
-  let timeslicesCount = 0;
-  for (const metricMeta of Object.values(metrics)) {
-    for (const s of metricMeta?.series ?? []) {
-      const ts = s.timeslices ?? [];
-      if (ts.length === 0) continue;
-      timeslicesCount = Math.max(timeslicesCount, ts.length);
-      const first = ts[0]!;
-      const last = ts.at(-1)!;
-      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
-      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
-    }
-  }
-  if (!Number.isFinite(startNs)) startNs = 0;
-  const tOf = (ns: number) => (ns - startNs) / 1e9;
-
-  // KV cache usage (gauge, 0..1)
-  const kvCacheUsage: TimeSeriesPoint[] = [];
-  const kvSeries =
-    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number') {
-      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
-    }
-  }
-
-  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
-  // `rate` is already per-window delta; we just divide.
-  const hitsTs = firstSeries('vllm:prefix_cache_hits')?.timeslices ?? [];
-  const qsTs = firstSeries('vllm:prefix_cache_queries')?.timeslices ?? [];
-  const prefixCacheHitRate: TimeSeriesPoint[] = [];
-  const minLen = Math.min(hitsTs.length, qsTs.length);
-  for (let i = 0; i < minLen; i++) {
-    const h = hitsTs[i]!;
-    const q = qsTs[i]!;
-    if (typeof q.rate === 'number' && q.rate > 0 && typeof h.rate === 'number') {
-      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
-    }
-  }
-
-  // Queue depth: pair running + waiting by index.
-  const runTs = firstSeries('vllm:num_requests_running')?.timeslices ?? [];
-  const waitTs = firstSeries('vllm:num_requests_waiting')?.timeslices ?? [];
-  const queueDepth: QueueDepthPoint[] = [];
-  const qlen = Math.min(runTs.length, waitTs.length);
-  for (let i = 0; i < qlen; i++) {
-    const r = runTs[i]!;
-    const w = waitTs[i]!;
-    const running = typeof r.avg === 'number' ? r.avg : 0;
-    const waiting = typeof w.avg === 'number' ? w.avg : 0;
-    queueDepth.push({
-      t: tOf(r.start_ns),
-      running,
-      waiting,
-      total: running + waiting,
-    });
-  }
-
-  // Throughput: extract counter `rate` (already per-second delta from aiperf).
-  const counterRateSeries = (name: string): TimeSeriesPoint[] => {
-    const s = firstSeries(name);
-    if (!s) return [];
-    const out: TimeSeriesPoint[] = [];
-    for (const ts of s.timeslices ?? []) {
-      if (typeof ts.rate === 'number') out.push({ t: tOf(ts.start_ns), value: ts.rate });
-    }
-    return out;
-  };
-  const prefillTps = counterRateSeries('vllm:prompt_tokens');
-  const decodeTps = counterRateSeries('vllm:generation_tokens');
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      case when (atr.chart_series ->> 'version')::int = ${CHART_SERIES_VERSION} then null else atr.server_metrics_json_gz end as blob,
+      atr.chart_series,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawMetaRow[];
+  const row = rows[0];
+  if (!row) return null;
+  // No early return on a null blob: the select omits it whenever chart_series is current.
+  const meta = buildMeta(row);
 
-  // Per-source prompt tokens — emit one TS array per source label.
-  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
-  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
-    const labels = series.labels ?? {};
-    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
-    const arr: TimeSeriesPoint[] = [];
-    for (const ts of series.timeslices ?? []) {
-      if (typeof ts.rate === 'number') {
-        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
-      }
-    }
-    if (arr.length > 0) promptTokensBySource[source] = arr;
+  // Fast path: pre-computed chart_series at the current version.
+  if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) {
+    return merge(meta, row.chart_series);
   }
 
-  return {
-    meta: pointMeta,
-    startNs,
-    endNs,
-    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
-    timeslicesCount,
-    kvCacheUsage,
-    prefixCacheHitRate,
-    queueDepth,
-    promptTokensBySource,
-    prefillTps,
-    decodeTps,
-  };
+  // Slow path: compute from the blob. `computeChartSeries` handles
+  // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP
+  // rows succeed even before the backfill drains them.
+  const series = await computeChartSeries(row.blob);
+  if (!series) return null;
+  return merge(meta, series);
 }

From 24fe8feae5175d80a53002fd4f3b3b77bb42e8c4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 22 May 2026 14:00:37 -0500
Subject: [PATCH 40/55] feat(agentic-detail): per-request Gantt timeline view
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a "Request timeline" view on the agentic point detail page, modeled
after the agent-timeline in semianalysis-claude-code-proxy. Each row is
a conversation (with sub-agent rows nested + indented under their
parent), each bar is one HTTP request from request_start → request_end
with a thin lead-in showing credit_issued → request_start queue wait.

Hover over any bar for per-request stats (TTFT, ISL/OSL, queue wait, phase,
worker, agent depth). Move the pointer anywhere over the chart for a crosshair
that shows the cursor time plus how many requests are running / waiting /
completed at that instant. The counts are O(log n) binary searches over
pre-sorted timestamps, so it stays smooth on big runs.

Same Alec pattern as 008/009: migration 010 adds a `request_timeline`
JSONB column on agentic_trace_replay, computed at ingest time and
backfilled for existing rows. ~30 KB per row vs the ~1-3 MB raw blob.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/request-timeline/route.ts  |  40 +
 .../agentic-point/agentic-point-detail.tsx    |  25 +-
 .../agentic-point/request-timeline.tsx        | 821 ++++++++++++++++++
 .../app/src/hooks/api/use-request-timeline.ts |  59 ++
 .../010_agentic_request_timeline.sql          |  15 +
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-request-timeline.ts  | 144 +++
 .../src/etl/compute-request-timeline.test.ts  | 153 ++++
 .../db/src/etl/compute-request-timeline.ts    | 182 ++++
 packages/db/src/etl/trace-replay-ingest.ts    |  18 +-
 packages/db/src/queries/request-timeline.ts   |  48 +
 11 files changed, 1498 insertions(+), 8 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/request-timeline/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.tsx
 create mode 100644 packages/app/src/hooks/api/use-request-timeline.ts
 create mode 100644 packages/db/migrations/010_agentic_request_timeline.sql
 create mode 100644 packages/db/src/backfill-request-timeline.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.test.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.ts
 create mode 100644 packages/db/src/queries/request-timeline.ts

diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
new file mode 100644
index 00000000..6c884fb2
--- /dev/null
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getRequestTimeline,
+  type RequestTimeline,
+} from '@semianalysisai/inferencex-db/queries/request-timeline';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedRequestTimeline = cachedQuery(
+  (id: number): Promise<RequestTimeline | null> => getRequestTimeline(getDb(), id),
+  'request-timeline', // NOTE(review): presumably the cache key/namespace — confirm in @/lib/api-cache
+  { blobOnly: true }, // NOTE(review): option semantics are defined by cachedQuery — confirm
+);
+
+/**
+ * GET /api/v1/request-timeline?id=N
+ *
+ * Returns the per-request Gantt timeline for one agentic benchmark point.
+ * Each request entry has ns-from-start offsets for credit/start/ack/end,
+ * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 400 unless
+ * `id` is a positive integer; 404 if no timeline is found; 500 on a thrown error.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedRequestTimeline(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching request timeline:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index a5bca4e0..2e43b4fb 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -6,6 +6,7 @@ import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
+import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -19,6 +20,7 @@ import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/seg
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
+import { RequestTimelineView } from './request-timeline';
 import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
@@ -82,9 +84,10 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
-type DetailView = 'point' | 'aggregates';
+type DetailView = 'point' | 'timeline' | 'aggregates';
 const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' },
   { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
 ];
 
@@ -120,6 +123,8 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+  // Per-request timeline fetched only when the timeline view is active.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline');
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -176,6 +181,11 @@ export function AgenticPointDetail({ id }: Props) {
             {aggregatesQuery.isLoading ? ' · loading…' : ''}
           </span>
         )}
+        {view === 'timeline' && timelineQuery.data && (
+          <span className="text-xs text-muted-foreground">
+            {timelineQuery.data.requests.length} requests
+          </span>
+        )}
       </div>
 
       {view === 'aggregates' ? (
@@ -184,6 +194,19 @@ export function AgenticPointDetail({ id }: Props) {
           aggregates={aggregatesQuery.data}
           isLoading={aggregatesQuery.isLoading}
         />
+      ) : view === 'timeline' ? (
+        timelineQuery.isLoading ? (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            Loading request timeline…
+          </div>
+        ) : timelineQuery.data ? (
+          <RequestTimelineView data={timelineQuery.data} />
+        ) : (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
+            isn&apos;t stored for this row.
+          </div>
+        )
       ) : (
         <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
           <ExpandableChart
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
new file mode 100644
index 00000000..bcbe105a
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -0,0 +1,821 @@
+'use client';
+
+import { useCallback, useMemo, useRef, useState } from 'react';
+
+import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+
+/**
+ * Gantt-style request timeline for one agentic benchmark point.
+ *
+ * Rows are conversations (or workers — toggle in the header). Bars are
+ * individual HTTP requests, drawn from request_start to request_end with a
+ * thin lead-in segment from credit_issued (load gen queue). Scroll-wheel
+ * zooms, drag pans, hover shows per-request stats.
+ *
+ * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy.
+ */
+
+type RowMode = 'conversation' | 'worker'; // rows are keyed by RequestRecord.cid or .wid
+
+const ROW_MODE_OPTIONS: SegmentedToggleOption<RowMode>[] = [
+  { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' },
+  { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' },
+];
+
+type PhaseFilter = 'all' | 'profiling'; // 'profiling' keeps only requests whose phase is 'profiling'
+
+const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' },
+  { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' },
+];
+
+/** Row color palette; buildRows cycles through it by group/tree index (modulo its length). */
+const ROW_COLORS = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
+/** Per-phase colors (thin strip at the bottom of each bar); `unknown` is the lookup fallback. */
+const PHASE_COLORS: Record<string, string> = {
+  profiling: '#22c55e',
+  warmup: '#94a3b8',
+  unknown: '#64748b',
+};
+
+interface Row {
+  key: string; // unique per row; used as the React key
+  label: string; // shortened id shown in the label column
+  color: string; // one of ROW_COLORS
+  requests: RequestRecord[]; // sorted by start; empty for a pseudo-parent header row
+  /** 0 = top-level conversation/worker, 1 = sub-agent row (buildRows emits only 0 or 1). */
+  depth: number;
+  /** True if this row is a sub-agent ("Subagent N of parent X"). */
+  isSubagent: boolean;
+}
+
+/**
+ * Conversation ids for subagent calls look like
+ *   <parent_cid>::sa:subagent_<N>_<hash>
+ * Split at the first `::sa:` into the parent cid and the sub-agent label that
+ * follows it; a cid without the separator is returned whole with `subagent: null`.
+ */
+function splitCid(cid: string): { parent: string; subagent: string | null } {
+  const sep = cid.indexOf('::sa:');
+  if (sep === -1) return { parent: cid, subagent: null };
+  return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) }; // 5 === '::sa:'.length
+}
+
+/** Group requests into rows keyed by cid or wid; in conversation mode subagent rows follow their parent. */
+function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
+  const groups = new Map<string, RequestRecord[]>();
+  for (const r of requests) {
+    const key = mode === 'conversation' ? r.cid : r.wid;
+    let list = groups.get(key);
+    if (!list) {
+      list = [];
+      groups.set(key, list);
+    }
+    list.push(r);
+  }
+
+  if (mode !== 'conversation') {
+    // Worker mode: flat rows, ordered by each worker's earliest request start.
+    const rows: Row[] = [];
+    let i = 0;
+    for (const [key, list] of groups) {
+      list.sort((a, b) => a.start - b.start);
+      rows.push({
+        key,
+        label: shortenWid(key),
+        color: ROW_COLORS[i % ROW_COLORS.length]!, // assigned in Map insertion order, before the sort below
+        requests: list,
+        depth: 0,
+        isSubagent: false,
+      });
+      i++;
+    }
+    rows.sort((a, b) => a.requests[0]!.start - b.requests[0]!.start); // every group has >= 1 request
+    return rows;
+  }
+
+  // Conversation mode: build a parent → [subagents] tree so each parent
+  // group renders as one parent row followed by its sub-agent rows. One
+  // palette color is shared by every row of a tree.
+  interface Tree {
+    parentCid: string;
+    parentRow: { key: string; requests: RequestRecord[] } | null;
+    subagents: Map<string, RequestRecord[]>; // subagent label → requests
+    firstStart: number;
+  }
+  const trees = new Map<string, Tree>();
+  for (const [cid, list] of groups) {
+    list.sort((a, b) => a.start - b.start);
+    const { parent, subagent } = splitCid(cid);
+    let tree = trees.get(parent);
+    if (!tree) {
+      tree = {
+        parentCid: parent,
+        parentRow: null,
+        subagents: new Map(),
+        firstStart: Number.POSITIVE_INFINITY,
+      };
+      trees.set(parent, tree);
+    }
+    if (subagent === null) {
+      tree.parentRow = { key: cid, requests: list };
+    } else {
+      tree.subagents.set(subagent, list);
+    }
+    const earliest = list[0]!.start; // list is non-empty and already sorted by start
+    if (earliest < tree.firstStart) tree.firstStart = earliest;
+  }
+
+  const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
+  const rows: Row[] = [];
+  let colorIdx = 0;
+  for (const tree of sortedTrees) {
+    const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
+    colorIdx++;
+    if (tree.parentRow) {
+      rows.push({
+        key: tree.parentRow.key,
+        label: shortenCid(tree.parentCid),
+        color,
+        requests: tree.parentRow.requests,
+        depth: 0,
+        isSubagent: false,
+      });
+    } else {
+      // No request carries the bare parent cid: emit an empty pseudo-parent
+      // header row so the orphan subagent rows still have a heading.
+      rows.push({
+        key: `__parent_${tree.parentCid}`,
+        label: shortenCid(tree.parentCid),
+        color,
+        requests: [],
+        depth: 0,
+        isSubagent: false,
+      });
+    }
+    const subagentEntries = [...tree.subagents.entries()].toSorted(
+      (a, b) => a[1][0]!.start - b[1][0]!.start,
+    );
+    for (const [saLabel, list] of subagentEntries) {
+      rows.push({
+        key: `${tree.parentCid}::${saLabel}`,
+        label: `↳ ${formatSubagentLabel(saLabel)}`,
+        color,
+        requests: list,
+        depth: 1, // splitCid cuts only at the first '::sa:', so nested subagents also land here
+        isSubagent: true,
+      });
+    }
+  }
+  return rows;
+}
+
+/** `subagent_001_bf1c5c16` → `subagent 001 · bf1c`; labels not matching that shape pass through unchanged. */
+function formatSubagentLabel(raw: string): string {
+  const m = /^subagent_(\d+)_([0-9a-f]+)$/i.exec(raw);
+  if (!m) return raw;
+  return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`; // keep only the first 4 hex chars of the hash
+}
+
+function shortenCid(cid: string): string { // ids longer than 12 chars → first 8 + "…" + last 4
+  if (cid.length <= 12) return cid;
+  return `${cid.slice(0, 8)}…${cid.slice(-4)}`;
+}
+
+function shortenWid(wid: string): string {
+  // Swap a leading "worker_" for "w_", then cap at 12 chars: worker_4ae87bea → w_4ae87bea
+  return wid.replace(/^worker_/, 'w_').slice(0, 12);
+}
+
+/** Format ns offset → "+250ms" / "+1.2s" (under 10 s) / "+12s" (10 s and up) / "+1.2m". */
+function formatTickLabel(ns: number): string {
+  const ms = ns / 1e6;
+  if (ms < 1000) return `+${ms.toFixed(0)}ms`;
+  if (ms < 60_000) return `+${(ms / 1000).toFixed(ms < 10_000 ? 1 : 0)}s`;
+  return `+${(ms / 60_000).toFixed(1)}m`;
+}
+
+function formatDuration(ms: number): string { // ms → "850ms" / "1.25s" / "2.50m"
+  if (ms < 1000) return `${ms.toFixed(0)}ms`;
+  if (ms < 60_000) return `${(ms / 1000).toFixed(2)}s`;
+  return `${(ms / 60_000).toFixed(2)}m`;
+}
+
+/** Binary search: how many entries of an ascending-sorted array are <= target. */
+function countLeq(sorted: number[], target: number): number {
+  let lo = 0;
+  let hi = sorted.length;
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1; // midpoint, floored via unsigned shift
+    if (sorted[mid]! <= target) lo = mid + 1;
+    else hi = mid;
+  }
+  return lo; // index of the first entry > target, i.e. the count of entries <= target
+}
+/** Binary search: how many entries of an ascending-sorted array are strictly < target. */
+function countLt(sorted: number[], target: number): number {
+  let lo = 0;
+  let hi = sorted.length;
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1; // midpoint, floored via unsigned shift
+    if (sorted[mid]! < target) lo = mid + 1;
+    else hi = mid;
+  }
+  return lo; // index of the first entry >= target, i.e. the count of entries < target
+}
+
+interface TooltipData {
+  x: number; // pointer clientX (viewport px) at hover time
+  y: number; // pointer clientY (viewport px) at hover time
+  row: Row; // row the hovered bar is drawn in
+  req: RequestRecord; // the hovered request
+}
+
+function Tooltip({ data }: { data: TooltipData }) { // fixed-position hover card for one request bar
+  const { row, req } = data;
+  const totalMs = (req.end - req.start) / 1e6; // start → end, ns → ms
+  const queueMs = (req.start - req.credit) / 1e6; // credit → start wait, ns → ms
+  return (
+    <div
+      className="fixed z-50 pointer-events-none rounded-md border border-border bg-card p-2.5 shadow-lg text-[11px]"
+      style={{ left: data.x + 12, top: data.y - 10, maxWidth: 280 }}
+    >
+      <div className="flex items-center gap-2 font-medium text-foreground">
+        <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: row.color }} />
+        <span className="truncate">{row.label}</span>
+        <span className="text-muted-foreground">· turn {req.ti}</span>
+        {req.cancelled && <span className="text-destructive">· cancelled</span>}
+      </div>
+      <div className="mt-1.5 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>Total</span>
+        <span className="text-foreground text-right tabular-nums">{formatDuration(totalMs)}</span>
+        <span>Queue wait</span>
+        <span className="text-foreground text-right tabular-nums">
+          {queueMs > 0.5 ? formatDuration(queueMs) : '—'}
+        </span>
+        {req.ttftMs !== null && (
+          <>
+            <span>TTFT</span>
+            <span className="text-foreground text-right tabular-nums">
+              {formatDuration(req.ttftMs)}
+            </span>
+          </>
+        )}
+        {req.isl !== null && (
+          <>
+            <span>ISL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.isl.toLocaleString()}
+            </span>
+          </>
+        )}
+        {req.osl !== null && (
+          <>
+            <span>OSL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.osl.toLocaleString()}
+            </span>
+          </>
+        )}
+        <span>Phase</span>
+        <span className="text-foreground text-right">{req.phase}</span>
+        {req.ad > 0 && (
+          <>
+            <span>Agent depth</span>
+            <span className="text-foreground text-right tabular-nums">{req.ad}</span>
+          </>
+        )}
+        <span>Worker</span>
+        <span className="text-foreground text-right truncate">{shortenWid(req.wid)}</span>
+      </div>
+      <div className="mt-1.5 pt-1 border-t border-border/40 text-[10px] text-muted-foreground">
+        Started at {formatTickLabel(req.start)}
+      </div>
+    </div>
+  );
+}
+
+export function RequestTimelineView({ data }: { data: RequestTimeline }) {
+  const [rowMode, setRowMode] = useState<RowMode>('conversation');
+  const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
+  const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+  const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
+
+  // Apply phase filter, then group into rows.
+  const filtered = useMemo(
+    () =>
+      phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
+    [data.requests, phaseFilter],
+  );
+  const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]);
+
+  // Pre-sort the timestamp columns so the cursor-time stats popover can
+  // count "running / waiting at time t" in O(log n). With a few hundred
+  // requests this is overkill — but it stays smooth on huge runs too.
+  const sortedTimes = useMemo(() => {
+    const credits = filtered.map((r) => r.credit).toSorted((a, b) => a - b);
+    const starts = filtered.map((r) => r.start).toSorted((a, b) => a - b);
+    const ends = filtered.map((r) => r.end).toSorted((a, b) => a - b);
+    return { credits, starts, ends };
+  }, [filtered]);
+
+  // Cursor state (vertical line + stats popover). null when the mouse
+  // isn't over the chart. xPx is svg-local; tNs is the ns offset from
+  // dataStart that the cursor is pointing at.
+  const [cursor, setCursor] = useState<{
+    xPx: number;
+    tNs: number;
+    clientX: number;
+    clientY: number;
+  } | null>(null);
+
+  // Timeline extent (clamped to actual data — if we filtered out warmup
+  // the visible window should shrink to just the profiling phase).
+  const dataStart = filtered.length === 0 ? 0 : Math.min(...filtered.map((r) => r.credit));
+  const dataEnd = filtered.length === 0 ? 1 : Math.max(...filtered.map((r) => r.end));
+  const totalNs = Math.max(dataEnd - dataStart, 1);
+
+  // Visible window state (ns offsets, relative to dataStart).
+  const [viewStart, setViewStart] = useState(0);
+  const [viewEnd, setViewEnd] = useState<number | null>(null);
+  const vStart = viewStart;
+  const vEnd = viewEnd ?? totalNs;
+  const visibleDur = Math.max(vEnd - vStart, 1);
+  const isZoomed = viewEnd !== null;
+
+  // Layout
+  const LABEL_WIDTH = 160;
+  const ROW_HEIGHT = 22;
+  const ROW_GAP = 3;
+  const HEADER_HEIGHT = 24;
+  const PADDING_RIGHT = 12;
+  const chartWidth = 920;
+  const svgHeight = HEADER_HEIGHT + rows.length * (ROW_HEIGHT + ROW_GAP) + 6;
+  const scale = (chartWidth - PADDING_RIGHT) / visibleDur;
+  // Local coords: convert ns offset from dataStart to x px.
+  const xOf = (ns: number) => (ns - dataStart - vStart) * scale;
+
+  // Time-axis ticks (~8 across visible window, snapped up to the nearest step in niceMs).
+  const niceMs = [
+    100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000,
+  ];
+  const targetMs = visibleDur / 1e6 / 8;
+  const tickMs = niceMs.find((n) => n >= targetMs) ?? targetMs;
+  const tickNs = tickMs * 1e6;
+  const ticks: number[] = [];
+  const tickStart = Math.floor(vStart / tickNs) * tickNs;
+  for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) {
+    if (t >= vStart && t <= vEnd) ticks.push(t);
+  }
+
+  const handleWheel = useCallback(
+    (e: React.WheelEvent<SVGSVGElement>) => {
+      e.preventDefault();
+      const rect = e.currentTarget.getBoundingClientRect();
+      const mouseX = e.clientX - rect.left;
+      const mouseRatio = Math.max(0, Math.min(1, mouseX / (chartWidth - PADDING_RIGHT)));
+      const curStart = vStart;
+      const curEnd = vEnd;
+      const curDur = curEnd - curStart;
+      const factor = e.deltaY > 0 ? 1.2 : 1 / 1.2;
+      const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs);
+      const pivot = curStart + mouseRatio * curDur;
+      let newStart = pivot - mouseRatio * newDur;
+      let newEnd = pivot + (1 - mouseRatio) * newDur;
+      if (newStart < 0) {
+        newEnd -= newStart;
+        newStart = 0;
+      }
+      if (newEnd > totalNs) {
+        newStart -= newEnd - totalNs;
+        newEnd = totalNs;
+        if (newStart < 0) newStart = 0;
+      }
+      if (newEnd - newStart >= totalNs * 0.99) {
+        setViewStart(0);
+        setViewEnd(null);
+      } else {
+        setViewStart(newStart);
+        setViewEnd(newEnd);
+      }
+    },
+    [vStart, vEnd, totalNs, chartWidth],
+  );
+
+  const handleMouseDown = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      if (e.button !== 0) return;
+      dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd };
+    },
+    [vStart, vEnd],
+  );
+
+  const handleMouseMove = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      // Dragging takes precedence over cursor tracking — panning the view.
+      if (dragRef.current) {
+        const dx = e.clientX - dragRef.current.startX;
+        const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT);
+        const delta = -dx * nsPerPx;
+        let ns = dragRef.current.vs + delta;
+        let ne = dragRef.current.ve + delta;
+        const dur = ne - ns;
+        if (ns < 0) {
+          ns = 0;
+          ne = dur;
+        }
+        if (ne > totalNs) {
+          ne = totalNs;
+          ns = totalNs - dur;
+          if (ns < 0) ns = 0;
+        }
+        setViewStart(ns);
+        setViewEnd(ne);
+        setTooltip(null);
+        setCursor(null);
+        return;
+      }
+      // Track the cursor position in svg-local px and the matching ns offset
+      // so the crosshair + stats popover can render. Clamped to the chart
+      // plot area (don't show a cursor on the axis labels gutter).
+      const rect = e.currentTarget.getBoundingClientRect();
+      const xPx = Math.max(0, Math.min(chartWidth - PADDING_RIGHT, e.clientX - rect.left));
+      const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT);
+      const tNs = vStart + xPx * nsPerPx;
+      setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY });
+    },
+    [visibleDur, chartWidth, totalNs, vStart],
+  );
+
+  const handleMouseUp = useCallback(() => {
+    dragRef.current = null;
+  }, []);
+
+  const handleMouseLeave = useCallback(() => {
+    dragRef.current = null;
+    setCursor(null);
+  }, []);
+
+  const resetZoom = useCallback(() => {
+    setViewStart(0);
+    setViewEnd(null);
+  }, []);
+
+  if (rows.length === 0) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        No requests in the current filter.
+      </div>
+    );
+  }
+
+  const totalRequests = filtered.length;
+
+  return (
+    <div className="space-y-3">
+      {/* Controls */}
+      <div className="flex flex-wrap items-center gap-2">
+        <SegmentedToggle
+          value={rowMode}
+          options={ROW_MODE_OPTIONS}
+          onValueChange={setRowMode}
+          ariaLabel="Row mode"
+          testId="timeline-row-mode"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        <SegmentedToggle
+          value={phaseFilter}
+          options={PHASE_OPTIONS}
+          onValueChange={setPhaseFilter}
+          ariaLabel="Phase filter"
+          testId="timeline-phase-filter"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        <span className="ml-auto text-xs text-muted-foreground">
+          {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
+          {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
+          {formatDuration((dataEnd - dataStart) / 1e6)}
+          {isZoomed && (
+            <>
+              {' · '}
+              <button type="button" onClick={resetZoom} className="text-foreground hover:underline">
+                reset zoom
+              </button>
+            </>
+          )}
+        </span>
+      </div>
+
+      {/* Chart container */}
+      <div className="rounded-md border border-border/60 bg-card overflow-hidden">
+        <div className="flex">
+          {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
+          <div
+            className="flex-shrink-0 border-r border-border/60 bg-card/80"
+            style={{ width: LABEL_WIDTH }}
+          >
+            <div
+              className="border-b border-border/60 flex items-end px-2 pb-1"
+              style={{ height: HEADER_HEIGHT }}
+            >
+              <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
+                {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+              </span>
+            </div>
+            {rows.map((row) => (
+              <div
+                key={row.key}
+                className="flex items-center gap-1.5 overflow-hidden pr-2"
+                style={{
+                  height: ROW_HEIGHT + ROW_GAP,
+                  paddingLeft: 8 + row.depth * 12,
+                }}
+              >
+                <span
+                  className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                  style={{
+                    backgroundColor: row.color,
+                    opacity: row.isSubagent ? 0.55 : 1,
+                  }}
+                />
+                <span
+                  className="text-[10px] font-mono truncate"
+                  style={{ color: row.color, opacity: row.isSubagent ? 0.85 : 1 }}
+                >
+                  {row.label}
+                </span>
+                <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                  {row.requests.length > 0 ? row.requests.length : '—'}
+                </span>
+              </div>
+            ))}
+          </div>
+
+          {/* Scrollable SVG */}
+          <div className="flex-1 overflow-x-auto">
+            <svg
+              width={chartWidth}
+              height={svgHeight}
+              className="block"
+              style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
+              onWheel={handleWheel}
+              onMouseDown={handleMouseDown}
+              onMouseMove={handleMouseMove}
+              onMouseUp={handleMouseUp}
+              onMouseLeave={handleMouseLeave}
+            >
+              {/* Header / time-axis baseline */}
+              <line
+                x1={0}
+                y1={HEADER_HEIGHT}
+                x2={chartWidth}
+                y2={HEADER_HEIGHT}
+                stroke="currentColor"
+                opacity={0.15}
+              />
+
+              {/* Time axis ticks */}
+              {ticks.map((t) => {
+                // Convert visible-window ns offset → x px (the tick array
+                // is already in dataStart-relative coords).
+                const x = (t - vStart) * scale;
+                return (
+                  <g key={t}>
+                    <line
+                      x1={x}
+                      y1={HEADER_HEIGHT}
+                      x2={x}
+                      y2={svgHeight}
+                      stroke="currentColor"
+                      opacity={0.08}
+                      strokeDasharray="2 4"
+                    />
+                    <text
+                      x={x + 2}
+                      y={HEADER_HEIGHT - 6}
+                      fill="currentColor"
+                      opacity={0.55}
+                      fontSize={9}
+                      fontFamily="ui-monospace, SFMono-Regular, monospace"
+                    >
+                      {formatTickLabel(t)}
+                    </text>
+                  </g>
+                );
+              })}
+
+              {/* Row separators */}
+              {rows.map((row, idx) => (
+                <line
+                  key={`sep-${row.key}`}
+                  x1={0}
+                  y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  x2={chartWidth}
+                  y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  stroke="currentColor"
+                  opacity={0.04}
+                />
+              ))}
+
+              {/* Request bars */}
+              {rows.map((row, rowIdx) => {
+                const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+                const barH = ROW_HEIGHT - 4;
+                return row.requests.map((req) => {
+                  const xCredit = xOf(req.credit);
+                  const xStart = xOf(req.start);
+                  const xEnd = xOf(req.end);
+                  // Cull bars entirely outside the visible window so big
+                  // benchmarks don't render thousands of zero-width rects.
+                  if (xEnd < -2 || xCredit > chartWidth + 2) return null;
+                  const runW = Math.max(xEnd - xStart, 1);
+                  const queueW = Math.max(xStart - xCredit, 0);
+                  const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+                  return (
+                    <g
+                      key={`${req.cid}-${req.ti}-${req.start}`}
+                      onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
+                      onMouseLeave={() => setTooltip(null)}
+                    >
+                      {/* Queue lead-in (faint) — only drawn when noticeable. */}
+                      {queueW >= 1 && (
+                        <rect
+                          x={xCredit}
+                          y={yTop + barH / 2 - 1}
+                          width={queueW}
+                          height={2}
+                          fill={row.color}
+                          opacity={0.35}
+                        />
+                      )}
+                      {/* Main bar */}
+                      <rect
+                        x={xStart}
+                        y={yTop}
+                        width={runW}
+                        height={barH}
+                        rx={2}
+                        fill={row.color}
+                        opacity={req.cancelled ? 0.35 : row.isSubagent ? 0.6 : 0.85}
+                      />
+                      {/* Phase strip at bottom */}
+                      <rect
+                        x={xStart}
+                        y={yTop + barH - 2}
+                        width={runW}
+                        height={2}
+                        rx={1}
+                        fill={phaseColor}
+                        opacity={0.85}
+                      />
+                      {/* Cancelled X overlay */}
+                      {req.cancelled && runW > 6 && (
+                        <line
+                          x1={xStart + 1}
+                          y1={yTop + 1}
+                          x2={xStart + runW - 1}
+                          y2={yTop + barH - 1}
+                          stroke="currentColor"
+                          strokeWidth={0.7}
+                          opacity={0.6}
+                        />
+                      )}
+                    </g>
+                  );
+                });
+              })}
+
+              {/* Cursor crosshair — drawn on top of bars so it stays visible
+                  through dense rows. Stats popover is rendered as fixed
+                  HTML below the SVG block. */}
+              {cursor && (
+                <line
+                  x1={cursor.xPx}
+                  x2={cursor.xPx}
+                  y1={0}
+                  y2={svgHeight}
+                  stroke="currentColor"
+                  strokeWidth={1}
+                  opacity={0.45}
+                  pointerEvents="none"
+                />
+              )}
+            </svg>
+          </div>
+        </div>
+      </div>
+
+      {/* Footer / legend */}
+      <div className="flex flex-wrap items-center gap-x-4 gap-y-1 px-1 text-[11px] text-muted-foreground">
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm bg-current opacity-30" />
+          queue wait
+        </span>
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#22c55e' }} />
+          profiling
+        </span>
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#94a3b8' }} />
+          warmup
+        </span>
+        <span className="ml-auto opacity-70">scroll to zoom · drag to pan</span>
+      </div>
+
+      {/* Cursor stats popover: count of in-flight / waiting at the cursor's
+          ns offset. Hidden when the user is hovering an individual bar
+          (per-request tooltip wins). */}
+      {cursor && !tooltip && (
+        <CursorPopover
+          cursor={cursor}
+          dataStart={dataStart}
+          startTimes={sortedTimes.starts}
+          endTimes={sortedTimes.ends}
+          creditTimes={sortedTimes.credits}
+        />
+      )}
+
+      {/* Tooltip */}
+      {tooltip && <Tooltip data={tooltip} />}
+    </div>
+  );
+}
+
+function CursorPopover({
+  cursor,
+  dataStart,
+  startTimes,
+  endTimes,
+  creditTimes,
+}: {
+  cursor: { xPx: number; tNs: number; clientX: number; clientY: number };
+  dataStart: number;
+  startTimes: number[];
+  endTimes: number[];
+  creditTimes: number[];
+}) {
+  // Popover content: request counts at cursor time t (ns from dataStart, t = tNs):
+  //   running  = #(start <= t) - #(end < t)
+  //   waiting  = #(credit <= t) - #(start <= t)
+  //   completed= #(end <= t)
+  const t = cursor.tNs;
+  const startsLeq = countLeq(startTimes, t);
+  const endsLt = countLt(endTimes, t);
+  const creditsLeq = countLeq(creditTimes, t);
+  const endsLeq = countLeq(endTimes, t);
+  const running = Math.max(0, startsLeq - endsLt);
+  const waiting = Math.max(0, creditsLeq - startsLeq);
+  const completed = endsLeq;
+  const inflight = running + waiting;
+  // Seconds elapsed since the timeline origin (dataStart), i.e. relative time.
+  const tSec = t / 1e9;
+  // Keep the popover inside the viewport: it sits 14 px right of the cursor, flips
+  // left when the 220 px reservation would clip, and is clamped at 0 on both axes.
+  const wantLeft = cursor.clientX + 14;
+  const left =
+    typeof window === 'undefined' || wantLeft + 220 < window.innerWidth
+      ? wantLeft
+      : Math.max(0, cursor.clientX - 220);
+  return (
+    <div
+      className="fixed z-40 pointer-events-none rounded-md border border-border bg-card/95 backdrop-blur p-2 shadow-lg text-[11px] font-mono"
+      style={{ left, top: Math.max(0, cursor.clientY - 60), minWidth: 180 }}
+    >
+      <div className="flex justify-between gap-3 text-foreground">
+        <span className="text-muted-foreground">t =</span>
+        <span className="tabular-nums">
+          {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`}
+        </span>
+      </div>
+      <div className="mt-1 pt-1 border-t border-border/40 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>In flight</span>
+        <span className="text-foreground text-right tabular-nums">{inflight}</span>
+        <span className="pl-3 text-[10px]">running</span>
+        <span className="text-foreground text-right tabular-nums">{running}</span>
+        <span className="pl-3 text-[10px]">waiting</span>
+        <span className="text-foreground text-right tabular-nums">{waiting}</span>
+        <span>Completed</span>
+        <span className="text-foreground text-right tabular-nums">{completed}</span>
+      </div>
+      {/* dataStart is informational — the displayed t is relative to it. */}
+      <div className="mt-1 pt-1 border-t border-border/40 text-[9px] text-muted-foreground">
+        relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock)
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
new file mode 100644
index 00000000..d3ceaab8
--- /dev/null
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -0,0 +1,59 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session). */
+  cid: string;
+  /** Zero-based turn index within the conversation. */
+  ti: number;
+  /** Worker id (concurrency slot that handled this request). */
+  wid: string;
+  /** Sub-agent depth (0 = top-level). */
+  ad: number;
+  /** Benchmark phase: `warmup` or `profiling` (`unknown` when the trace omits it). */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  ttftMs: number | null; // time-to-first-token in ms
+  isl: number | null; // input sequence length (tokens)
+  osl: number | null; // output sequence length (tokens)
+  cancelled: boolean; // true when the trace flagged the request as cancelled
+}
+
+export interface RequestTimeline {
+  version: number; // version of the extraction algorithm that produced this payload
+  startNs: number; // ns timestamp used as the origin for every request offset
+  endNs: number; // ns timestamp of the latest request end
+  durationS: number; // total span in seconds
+  requests: RequestRecord[];
+}
+
+async function fetchRequestTimeline(
+  id: number,
+  signal?: AbortSignal,
+): Promise<RequestTimeline | null> {
+  const resp = await fetch(`/api/v1/request-timeline?id=${id}`, { signal });
+  // A 404 is mapped to null rather than an error; other non-OK statuses throw.
+  if (resp.status !== 404 && !resp.ok) throw new Error(`request-timeline ${resp.status}`);
+  return resp.status === 404 ? null : ((await resp.json()) as RequestTimeline);
+}
+
+/**
+ * Lazily loads the per-request Gantt timeline for one agentic point: idle
+ * until `enabled` is set with a truthy `id`; data stays fresh for 5 minutes.
+ */
+export function useRequestTimeline(id: number | null, enabled = false) {
+  const shouldFetch = enabled && Boolean(id);
+  return useQuery({
+    queryKey: ['request-timeline', id] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      id ? fetchRequestTimeline(id, signal) : Promise.resolve(null),
+    enabled: shouldFetch,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql
new file mode 100644
index 00000000..756b775e
--- /dev/null
+++ b/packages/db/migrations/010_agentic_request_timeline.sql
@@ -0,0 +1,15 @@
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+  add column request_timeline jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index f97c442a..710089f1 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -21,6 +21,7 @@
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+    "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
new file mode 100644
index 00000000..327099d0
--- /dev/null
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -0,0 +1,144 @@
+/**
+ * Backfill `agentic_trace_replay.request_timeline` for rows that are
+ * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`.
+ *
+ * The ingest path now computes the timeline inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 010 and any time the version bumps.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+} from './etl/compute-request-timeline.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null; // --limit N: cap on candidate rows; null when the flag is absent
+  force: boolean; // --force: recompute rows even when their stored version matches
+}
+
+function parseFlags(): CliFlags {
+  // Reads --force / --limit N from process.argv; exits(1) on an invalid limit.
+  const parsed: CliFlags = { limit: null, force: false };
+  for (let i = 2; i < process.argv.length; i++) {
+    if (process.argv[i] === '--force') parsed.force = true;
+    else if (process.argv[i] === '--limit') {
+      // SQL LIMIT needs a positive integer; 0 would later be read as "no limit".
+      const n = Number(process.argv[++i] ?? Number.NaN);
+      if (!Number.isInteger(n) || n <= 0) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+      parsed.limit = n;
+    }
+  }
+  return parsed;
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-request-timeline ===');
+  console.log(`  REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Candidates must have a profile_export blob (nothing to compute
+  // otherwise; such rows keep `request_timeline` null and the API serves
+  // them as "no timeline data"). Without --force, rows whose stored
+  // version already matches are excluded as well.
+  const staleFilter = flags.force
+    ? sql``
+    : sql`
+        and (
+          request_timeline is null
+          or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION}
+        )
+      `;
+  const limitClause = flags.limit ? sql`limit ${flags.limit}` : sql``;
+  const candidates = await sql<{ id: number }[]>`
+    select id
+    from agentic_trace_replay
+    where profile_export_jsonl_gz is not null
+    ${staleFilter}
+    order by id
+    ${limitClause}
+  `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  // --yes skips the interactive confirmation.
+  if (!hasYesFlag() && !(await confirm('\nProceed? (y/N) '))) {
+    console.log('Aborted.');
+    return;
+  }
+
+  // Rows are handled one at a time; a failure is logged and counted but
+  // does not stop the run (the exit code reports it at the end).
+  let succeeded = 0;
+  let failed = 0;
+  const runStartedAt = Date.now();
+  const secondsSince = (since: number) => Math.round((Date.now() - since) / 1000);
+  for (const { id } of candidates) {
+    const rowStartedAt = Date.now();
+    try {
+      const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>`
+        select profile_export_jsonl_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+      const timeline = computeRequestTimeline(row.profile_export_jsonl_gz);
+      const timelineParam =
+        timeline === null
+          ? null
+          : sql.json(structuredClone(timeline) as unknown as Parameters<typeof sql.json>[0]);
+      await sql`
+        update agentic_trace_replay
+        set request_timeline = ${timelineParam}
+        where id = ${id}
+      `;
+      succeeded++;
+      const rowSec = secondsSince(rowStartedAt);
+      const runSec = secondsSince(runStartedAt);
+      console.log(
+        `  ✓ id=${id} (${rowSec}s, ${succeeded}/${candidates.length} done, ${runSec}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = secondsSince(runStartedAt);
+  console.log(`\n=== backfill complete: ${succeeded} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-request-timeline failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
new file mode 100644
index 00000000..64512aca
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -0,0 +1,153 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js';
+
+interface SyntheticRequest {
+  cid: string;
+  ti: number;
+  wid?: string;
+  ad?: number;
+  phase?: string;
+  credit: number;
+  start: number;
+  end: number;
+  ack?: number | null;
+  ttftMs?: number | null;
+  isl?: number | null;
+  osl?: number | null;
+  cancelled?: boolean;
+}
+
+function makeBlob(requests: SyntheticRequest[]) {
+  // One JSONL record per request; omitted optional fields get fixture defaults.
+  const toRecord = (req: SyntheticRequest) => ({
+    metadata: {
+      conversation_id: req.cid,
+      turn_index: req.ti,
+      worker_id: req.wid ?? 'worker_default',
+      agent_depth: req.ad ?? 0,
+      benchmark_phase: req.phase ?? 'profiling',
+      credit_issued_ns: req.credit,
+      request_start_ns: req.start,
+      ...(req.ack === undefined ? {} : { request_ack_ns: req.ack }),
+      request_end_ns: req.end,
+      was_cancelled: req.cancelled ?? false,
+    },
+    metrics: {
+      time_to_first_token: req.ttftMs === null ? null : { value: req.ttftMs ?? 50, unit: 'ms' },
+      input_sequence_length: { value: req.isl ?? 100, unit: 'tokens' },
+      output_sequence_length: { value: req.osl ?? 10, unit: 'tokens' },
+    },
+  });
+  const jsonl = requests.map((req) => JSON.stringify(toRecord(req))).join('\n');
+  return gzipSync(Buffer.from(jsonl));
+}
+
+describe('computeRequestTimeline', () => {
+  it('returns null when the blob is null', () => {
+    expect(computeRequestTimeline(null)).toBeNull();
+  });
+
+  it('returns null on a malformed (non-gzip) blob', () => {
+    expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull();
+  });
+
+  it('returns null when the blob has no parseable records', () => {
+    expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull();
+  });
+
+  it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]),
+    );
+    expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION);
+  });
+
+  it('shifts ns timestamps to be relative to the earliest credit_issued', () => {
+    // Two requests with absolute ns starting at 1_000_000_000.
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 },
+        { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 },
+      ]),
+    );
+    expect(tl?.startNs).toBe(1_000_000_000);
+    expect(tl?.endNs).toBe(1_030_000_000);
+    expect(tl?.durationS).toBeCloseTo(0.03, 6);
+    expect(tl?.requests[0]?.credit).toBe(0);
+    expect(tl?.requests[0]?.end).toBe(10_000_000);
+    expect(tl?.requests[1]?.start).toBe(21_000_000);
+  });
+
+  it('sorts requests by start time, regardless of input order', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 },
+        { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 },
+        { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 },
+      ]),
+    );
+    expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]);
+  });
+
+  it('preserves conversation/worker grouping fields', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'conv-A',
+          ti: 5,
+          wid: 'worker_abcd1234',
+          ad: 2,
+          phase: 'profiling',
+          credit: 0,
+          start: 10,
+          end: 100,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cid).toBe('conv-A');
+    expect(r.ti).toBe(5);
+    expect(r.wid).toBe('worker_abcd1234');
+    expect(r.ad).toBe(2);
+    expect(r.phase).toBe('profiling');
+  });
+
+  it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          ttftMs: 25.5,
+          isl: 1024,
+          osl: 256,
+          cancelled: true,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cancelled).toBe(true);
+    expect(r.ttftMs).toBeCloseTo(25.5, 6);
+    expect(r.isl).toBe(1024);
+    expect(r.osl).toBe(256);
+  });
+
+  it('skips records missing both credit_issued_ns and request_start_ns', () => {
+    // Build a record with only request_end_ns — the helper rejects it.
+    const broken = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 },
+          metrics: {},
+        }),
+      ),
+    );
+    expect(computeRequestTimeline(broken)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
new file mode 100644
index 00000000..a1134f7a
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -0,0 +1,182 @@
+/**
+ * Pre-compute the per-request timeline for the agentic detail page's
+ * Gantt view. Output lands in `agentic_trace_replay.request_timeline`
+ * and is read directly by the timeline API route.
+ *
+ * Shape is a thin array — ~150 bytes per request × ~200 requests per
+ * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw
+ * gzipped JSONL blob (~1-3 MB).
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const REQUEST_TIMELINE_VERSION = 1;
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session); `unknown` when absent. */
+  cid: string;
+  /** Zero-based turn index within the conversation (0 when absent). */
+  ti: number;
+  /** Worker id (concurrency slot that handled this request); `unknown` when absent. */
+  wid: string;
+  /** Sub-agent depth (0 = top-level, also the default when absent). */
+  ad: number;
+  /** Benchmark phase: `warmup` or `profiling` (`unknown` when the trace omits it). */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  /** Time-to-first-token in ms. */
+  ttftMs: number | null;
+  /** Input sequence length (tokens). */
+  isl: number | null;
+  /** Output sequence length (tokens). */
+  osl: number | null;
+  cancelled: boolean; // true only when the record's `was_cancelled` is exactly `true`
+}
+
+export interface RequestTimeline {
+  version: number; // REQUEST_TIMELINE_VERSION at the time this timeline was computed
+  /** Wall-clock ns of the earliest `credit_issued_ns` (else `request_start_ns`); offset origin. */
+  startNs: number;
+  /** Wall-clock ns of the latest `request_end_ns` (not shifted by `startNs`). */
+  endNs: number;
+  /** Total span in seconds (0 when `endNs` is not after `startNs`). */
+  durationS: number;
+  requests: RequestRecord[];
+}
+
+interface RawMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  worker_id?: string;
+  agent_depth?: number;
+  benchmark_phase?: string;
+  credit_issued_ns?: number;
+  request_start_ns?: number;
+  request_ack_ns?: number;
+  request_end_ns?: number;
+  was_cancelled?: boolean;
+}
+
+interface RawMetricValue {
+  value?: number;
+}
+
+interface RawRecord {
+  metadata?: RawMetadata;
+  metrics?: {
+    time_to_first_token?: RawMetricValue | number;
+    input_sequence_length?: RawMetricValue | number;
+    output_sequence_length?: RawMetricValue | number;
+  };
+}
+
+/**
+ * Unwraps a metric that arrives either as a bare number or inside a
+ * `{ value, unit }` envelope. Anything non-numeric or non-finite yields undefined.
+ */
+function readNum(v: unknown): number | undefined {
+  const candidate: unknown =
+    v && typeof v === 'object' && 'value' in v ? (v as { value?: unknown }).value : v;
+  return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : undefined;
+}
+
+/**
+ * Decompress and parse the gzipped `profile_export.jsonl` blob into the
+ * per-request timeline. Yields null for a missing blob, a blob that fails
+ * to gunzip, or one without usable records (unparseable lines and records
+ * lacking a start or end timestamp are skipped).
+ */
+export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null {
+  if (!blob) return null;
+  let decoded: string;
+  try {
+    decoded = gunzipSync(blob).toString('utf8');
+  } catch {
+    return null;
+  }
+
+  // Pass 1: keep each record that carries both a lifecycle start and an
+  // end, tracking the earliest start (origin) and the latest end.
+  const kept: (Pick<RequestRecord, 'ttftMs' | 'isl' | 'osl'> & { meta: RawMetadata })[] = [];
+  let originNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+
+  for (const line of decoded.split('\n')) {
+    if (!line) continue;
+    let parsed: RawRecord;
+    try {
+      parsed = JSON.parse(line) as RawRecord;
+    } catch {
+      continue;
+    }
+    const meta = parsed.metadata ?? {};
+    const metrics = parsed.metrics;
+    // Lifecycle start: credit_issued_ns when present, else request_start_ns.
+    const lifecycleStart = meta.credit_issued_ns ?? meta.request_start_ns;
+    const lifecycleEnd = meta.request_end_ns;
+    if (typeof lifecycleStart !== 'number' || typeof lifecycleEnd !== 'number') continue;
+
+    if (lifecycleStart < originNs) originNs = lifecycleStart;
+    if (lifecycleEnd > endNs) endNs = lifecycleEnd;
+
+    kept.push({
+      meta,
+      ttftMs: readNum(metrics?.time_to_first_token) ?? null,
+      isl: readNum(metrics?.input_sequence_length) ?? null,
+      osl: readNum(metrics?.output_sequence_length) ?? null,
+    });
+  }
+
+  if (kept.length === 0) return null;
+  if (!Number.isFinite(originNs)) originNs = 0;
+
+  // Pass 2: express every timestamp as an offset from the origin so the
+  // stored numbers stay small.
+  // NOTE(review): if the raw values are epoch nanoseconds they exceed
+  // Number.MAX_SAFE_INTEGER, so JSON.parse has already rounded them before
+  // this subtraction — presumably fine for a Gantt view; confirm.
+  const requests = kept.map(({ meta: m, ttftMs, isl, osl }): RequestRecord => {
+    const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs;
+    const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs;
+    const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null;
+    const end = (m.request_end_ns ?? originNs) - originNs;
+    return {
+      cid: m.conversation_id ?? 'unknown',
+      ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      wid: m.worker_id ?? 'unknown',
+      ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
+      phase: m.benchmark_phase ?? 'unknown',
+      credit,
+      start,
+      ack,
+      end,
+      ttftMs,
+      isl,
+      osl,
+      cancelled: m.was_cancelled === true,
+    };
+  });
+
+  // Array.prototype.sort is stable, so requests with equal starts keep
+  // their input order and the output is deterministic.
+  requests.sort((a, b) => a.start - b.start);
+
+  return {
+    version: REQUEST_TIMELINE_VERSION,
+    startNs: originNs,
+    endNs,
+    durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0,
+    requests,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index f70200ff..8cc03f2a 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -14,6 +14,7 @@ import type postgres from 'postgres';
 
 import { computeAggregateStats } from './compute-aggregate-stats.js';
 import { computeChartSeries } from './compute-chart-series.js';
+import { computeRequestTimeline } from './compute-request-timeline.js';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -58,13 +59,14 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
-  // Pre-compute the aggregate stats + chart-ready time-series so the
-  // detail page / aggregates view doesn't have to re-parse these blobs on
-  // every request. Both helpers tolerate a null blob and fall back to a
-  // streaming parser for oversized server_metrics blobs.
-  const [aggregateStats, chartSeries] = await Promise.all([
+  // Pre-compute aggregate stats + chart-ready time-series + per-request
+  // timeline so the detail page doesn't have to re-parse these blobs on
+  // every request. Each helper tolerates a null blob; the two that read
+  // server_metrics fall back to a streaming parser for oversized blobs.
+  const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
     computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
     computeChartSeries(metricsJsonGz),
+    Promise.resolve(computeRequestTimeline(profileGz)),
   ]);
 
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
@@ -76,7 +78,8 @@ export async function insertTraceReplay(
       server_metrics_json_gz,
       server_metrics_json_uncompressed_size,
       aggregate_stats,
-      chart_series
+      chart_series,
+      request_timeline
     )
     values (
       ${profileGz},
@@ -86,7 +89,8 @@ export async function insertTraceReplay(
       ${metricsJsonGz},
       ${metricsJsonSize},
       ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
-      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])}
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])},
+      ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
new file mode 100644
index 00000000..2bd3e251
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.ts
@@ -0,0 +1,48 @@
+/**
+ * Per-request timeline for the agentic detail page's Gantt view.
+ *
+ * Backed by `agentic_trace_replay.request_timeline` (pre-computed at
+ * ingest time, see `etl/compute-request-timeline.ts`). The fast path is
+ * a single SQL row read; the slow path re-computes from
+ * `profile_export_jsonl_gz` and is only taken when the column is NULL
+ * or its stored `version` differs from `REQUEST_TIMELINE_VERSION`.
+ */
+
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+  type RequestTimeline,
+} from '../etl/compute-request-timeline';
+
+import type { DbClient } from '../connection.js';
+
+export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
+
+interface RawRow {
+  blob: Buffer | null;
+  request_timeline: RequestTimeline | null;
+}
+
+export async function getRequestTimeline(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<RequestTimeline | null> {
+  const rows = (await sql`
+    select
+      atr.profile_export_jsonl_gz as blob,
+      atr.request_timeline
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawRow[];
+  const row = rows[0];
+  if (!row) return null;
+
+  // Fast path: pre-computed timeline at the current version.
+  if (row.request_timeline && Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION) {
+    return row.request_timeline;
+  }
+
+  // Slow path: recompute from the blob (rare — only stale/missing rows).
+  return computeRequestTimeline(row.blob);
+}

From f2618f44d6eafa38bffb3b9b9ec39c5224d62b76 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 22 May 2026 14:21:24 -0500
Subject: [PATCH 41/55] fix(agentic-detail): aggregate vllm metrics across all
 engine series
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chart_series + aggregate_stats helpers were only reading series[0]
for each metric, which under-counted by Nx on multi-engine DP/PP vllm
deployments (each engine reports its own series in
vllm:num_requests_running, kv_cache_usage_perc, prompt_tokens, etc.).

Worst-case visible effect: for point 206032 (b200, dsv4, conc=24,
8-engine cluster), the queue-depth chart maxed at ~3 while the
per-request timeline correctly showed ~22 concurrent. Other metrics
were similarly clipped — prefix-cache hit rate, throughput, KV util.

Now we sum gauges + counter rates across all engines, and average
kv_cache_usage_perc (since it's a per-engine fraction). After fix, the
same row's peak queue depth reads 24 (running 21 + waiting 3), matching
the timeline.

Bumps STATS_VERSION + CHART_SERIES_VERSION to 2 so the backfill scripts
recompute existing rows; both were re-run against 130/26 rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../db/src/etl/compute-chart-series.test.ts   |  80 +++++++++
 packages/db/src/etl/compute-chart-series.ts   | 154 ++++++++++--------
 packages/db/src/queries/agentic-aggregates.ts |  90 ++++++----
 3 files changed, 226 insertions(+), 98 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
index dafc7200..4c6f8791 100644
--- a/packages/db/src/etl/compute-chart-series.test.ts
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -63,6 +63,48 @@ function makeBlob(opts?: {
   return gzipSync(Buffer.from(json));
 }
 
+/** Build a synthetic per-engine vLLM metric series for the multi-engine test. */
+function buildEngineSeries(engineId: number, baseRunning: number) {
+  const labels = { engine: String(engineId) };
+  return {
+    runningSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: baseRunning },
+        { start_ns: 1e9, avg: baseRunning + 1 },
+      ],
+    },
+    waitingSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0 },
+        { start_ns: 1e9, avg: 0 },
+      ],
+    },
+    kvSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0.25 },
+        { start_ns: 1e9, avg: 0.5 },
+      ],
+    },
+    promptSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 100 },
+        { start_ns: 1e9, rate: 200 },
+      ],
+    },
+    genSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 50 },
+        { start_ns: 1e9, rate: 75 },
+      ],
+    },
+  };
+}
+
 describe('computeChartSeries', () => {
   it('returns null when the blob is null', async () => {
     expect(await computeChartSeries(null)).toBeNull();
@@ -126,4 +168,42 @@ describe('computeChartSeries', () => {
     const result = await computeChartSeries(Buffer.from('not-gzip-data'));
     expect(result).toBeNull();
   });
+
+  it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => {
+    // Simulate a 4-engine deployment: each engine reports its own series for
+    // every metric. Cluster-wide value should be SUM for running/waiting and
+    // counter rates, AVG for kv_cache_usage_perc (per-engine fraction).
+    const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) },
+        'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) },
+        'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) },
+        'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) },
+        'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) },
+      },
+    });
+    const blob = gzipSync(Buffer.from(json));
+    const cs = await computeChartSeries(blob);
+    expect(cs).not.toBeNull();
+    // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1
+    expect(cs!.queueDepth).toEqual([
+      { t: 0, running: 12, waiting: 0, total: 12 },
+      { t: 1, running: 16, waiting: 0, total: 16 },
+    ]);
+    // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value)
+    expect(cs!.kvCacheUsage).toEqual([
+      { t: 0, value: 0.25 },
+      { t: 1, value: 0.5 },
+    ]);
+    // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800
+    expect(cs!.prefillTps).toEqual([
+      { t: 0, value: 400 },
+      { t: 1, value: 800 },
+    ]);
+    expect(cs!.decodeTps).toEqual([
+      { t: 0, value: 200 },
+      { t: 1, value: 300 },
+    ]);
+  });
 });
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 3cb4181b..530600cf 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -17,8 +17,16 @@ import { parser } from 'stream-json';
 import { pick } from 'stream-json/filters/pick.js';
 import { streamObject } from 'stream-json/streamers/stream-object.js';
 
-/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const CHART_SERIES_VERSION = 1;
+/**
+ * Bump when the extraction algorithm changes — backfill recomputes anything
+ * older.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP
+ * deployments — most visible as a request-queue-depth chart that maxed out
+ * at ~3 when the timeline clearly showed 20+ in-flight).
+ */
+export const CHART_SERIES_VERSION = 2;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -147,17 +155,44 @@ export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeri
   return buildSeriesFromMetrics(metrics);
 }
 
-/** Pull the first series under a metric key, or undefined. */
-function firstSeries(metrics: MetricsMap, name: string): RawSeries | undefined {
-  const s = metrics[name]?.series;
-  return s && s.length > 0 ? s[0] : undefined;
+/**
+ * Aggregate one timeslice field across all series of a metric, indexed by
+ * `start_ns`. Multi-engine vllm deployments report one series per engine —
+ * the cluster value is the sum (for running/waiting/throughput counters)
+ * or the average (for kv_cache_usage_perc, a per-engine fraction).
+ */
+function aggregateByStart(
+  series: readonly RawSeries[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of series ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
+/** Stable order: emit one point per unique start_ns, chronologically. */
+function sortedEntries(m: Map<number, number>): [number, number][] {
+  return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
 }
 
 function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   // Timing reference: smallest start_ns and largest end_ns across every
-  // timeslice we extracted. (Same logic as the original getTraceServerMetrics
-  // — looking at every metric gives the widest possible window even if some
-  // series start late.)
+  // timeslice we extracted. timeslicesCount is the length of any single
+  // series (engines are scraped on the same cadence), so picking the max
+  // length across all series of all metrics is safe.
   let startNs = Number.POSITIVE_INFINITY;
   let endNs = 0;
   let timeslicesCount = 0;
@@ -175,83 +210,70 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   if (!Number.isFinite(startNs)) startNs = 0;
   const tOf = (ns: number) => (ns - startNs) / 1e9;
 
-  // KV cache usage (gauge, 0..1)
-  const kvCacheUsage: TimeSeriesPoint[] = [];
+  // KV cache usage (gauge, 0..1) — average across engines so the value
+  // stays a fraction (each engine has its own KV pool).
   const kvSeries =
-    firstSeries(metrics, 'vllm:kv_cache_usage_perc') ??
-    firstSeries(metrics, 'vllm:gpu_cache_usage_perc');
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number' && typeof ts.start_ns === 'number') {
-      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
-    }
-  }
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
+    aggregateByStart(kvSeries, 'avg', 'avg'),
+  ).map(([t, v]) => ({ t: tOf(t), value: v }));
 
-  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
-  const hitsTs = firstSeries(metrics, 'vllm:prefix_cache_hits')?.timeslices ?? [];
-  const qsTs = firstSeries(metrics, 'vllm:prefix_cache_queries')?.timeslices ?? [];
+  // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
+  // engines, joined on start_ns.
+  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
+  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
   const prefixCacheHitRate: TimeSeriesPoint[] = [];
-  const minLen = Math.min(hitsTs.length, qsTs.length);
-  for (let i = 0; i < minLen; i++) {
-    const h = hitsTs[i]!;
-    const q = qsTs[i]!;
-    if (
-      typeof q.rate === 'number' &&
-      q.rate > 0 &&
-      typeof h.rate === 'number' &&
-      typeof h.start_ns === 'number'
-    ) {
-      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
-    }
+  for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) {
+    const q = qsByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q });
   }
 
-  // Queue depth: pair running + waiting by index.
-  const runTs = firstSeries(metrics, 'vllm:num_requests_running')?.timeslices ?? [];
-  const waitTs = firstSeries(metrics, 'vllm:num_requests_waiting')?.timeslices ?? [];
+  // Queue depth: sum running + waiting across engines per timeslice.
+  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
+  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
   const queueDepth: QueueDepthPoint[] = [];
-  const qlen = Math.min(runTs.length, waitTs.length);
-  for (let i = 0; i < qlen; i++) {
-    const r = runTs[i]!;
-    const w = waitTs[i]!;
-    if (typeof r.start_ns !== 'number') continue;
-    const running = typeof r.avg === 'number' ? r.avg : 0;
-    const waiting = typeof w.avg === 'number' ? w.avg : 0;
-    queueDepth.push({
-      t: tOf(r.start_ns),
-      running,
-      waiting,
-      total: running + waiting,
-    });
+  // Union of timestamps so we surface activity even if one of the gauges
+  // didn't report a sample on a given tick.
+  const allTimes = new Set<number>([...runByT.keys(), ...waitByT.keys()]);
+  for (const t of [...allTimes].toSorted((a, b) => a - b)) {
+    const running = runByT.get(t) ?? 0;
+    const waiting = waitByT.get(t) ?? 0;
+    queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting });
   }
 
-  // Throughput: extract counter `rate` (already per-second from aiperf).
-  const counterRate = (name: string): TimeSeriesPoint[] => {
-    const s = firstSeries(metrics, name);
-    if (!s) return [];
-    const out: TimeSeriesPoint[] = [];
-    for (const ts of s.timeslices ?? []) {
-      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
-        out.push({ t: tOf(ts.start_ns), value: ts.rate });
-      }
-    }
-    return out;
-  };
+  // Throughput: sum the counter `rate` (already per-second) across engines.
+  const counterRate = (name: string): TimeSeriesPoint[] =>
+    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+      t: tOf(t),
+      value: v,
+    }));
   const prefillTps = counterRate('vllm:prompt_tokens');
   const decodeTps = counterRate('vllm:generation_tokens');
 
-  // Per-source prompt tokens — emit one TS array per source label.
-  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  // Per-source prompt tokens — sum across engines per source label.
+  const promptBySrcByT = new Map<string, Map<number, number>>();
   for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
     const labels = series.labels ?? {};
     const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
-    const arr: TimeSeriesPoint[] = [];
+    let byT = promptBySrcByT.get(source);
+    if (!byT) {
+      byT = new Map<number, number>();
+      promptBySrcByT.set(source, byT);
+    }
     for (const ts of series.timeslices ?? []) {
       if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
-        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+        byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
       }
     }
+  }
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const [source, byT] of promptBySrcByT) {
+    const arr: TimeSeriesPoint[] = [];
+    for (const [t, v] of [...byT.entries()].toSorted((a, b) => a[0] - b[0])) {
+      if (v > 0) arr.push({ t: tOf(t), value: v });
+    }
     if (arr.length > 0) promptTokensBySource[source] = arr;
   }
-
   return {
     version: CHART_SERIES_VERSION,
     startNs,
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 8ac4f678..1ad7fd7f 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -29,8 +29,11 @@ import type { DbClient } from '../connection.js';
  * script recomputes any row whose stored `aggregate_stats.version` is older.
  * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
  * import: the compute helper depends on the percentile utilities below.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
  */
-export const STATS_VERSION = 1;
+export const STATS_VERSION = 2;
 
 export interface MetricPercentiles {
   mean: number;
@@ -154,10 +157,47 @@ interface MetricsJson {
   metrics?: Record<string, MetricMeta>;
 }
 
+/**
+ * Aggregate a per-timeslice field across all series of a metric, indexed by
+ * the timeslice's `start_ns`. vllm reports one series per engine on
+ * multi-engine DP/PP deployments, so we sum (or average) across engines to
+ * get the cluster-wide value at each timeslice.
+ *
+ * `field` selects which numeric field on a timeslice to read (`avg` for
+ * gauges, `rate` for counter deltas). `combine` controls cross-engine math:
+ * 'sum' for running/waiting/throughput counters where the cluster total is
+ * the sum; 'avg' for KV cache utilization, which is bounded [0, 1] per
+ * engine and should be averaged across engines for the cluster view.
+ */
+function aggregateSeriesByStart(
+  metricSeries: readonly Series[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of metricSeries ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
 /**
  * Parse the server_metrics_json → time-series arrays for KV cache util and
  * prefix cache hit rate (per-interval, computed from the prometheus
  * counters the same way trace-server-metrics does it).
+ *
+ * Aggregates across all engine series so multi-engine DP/PP deployments are
+ * counted correctly (previously we only read engine 0).
  */
 export function extractServerMetricSamples(json: string): {
   kvCacheUtil: number[];
@@ -165,40 +205,26 @@ export function extractServerMetricSamples(json: string): {
 } {
   const parsed = JSON.parse(json) as MetricsJson;
   const metrics = parsed.metrics ?? {};
-  const firstSeries = (name: string): Series | undefined => {
-    const s = metrics[name]?.series;
-    return s && s.length > 0 ? s[0] : undefined;
-  };
 
-  // KV cache util — gauge in [0, 1].
-  const kvSeries =
-    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
-  const kvCacheUtil: number[] = [];
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number') kvCacheUtil.push(ts.avg);
-  }
+  // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
+  // value stays a fraction in [0, 1]; summing would give a meaningless 0..N.
+  const kvSeriesAll =
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
 
-  // Prefix cache hit rate per interval = hits.rate / queries.rate.
-  // Matches the derivation in queries/trace-server-metrics.ts.
-  // Metric names: vllm exposes these as `vllm:prefix_cache_*` (no `gpu_`
-  // prefix); falls back to the `gpu_`-prefixed names in case a future
-  // vllm version renames them.
+  // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across all
+  // engines (sum first, then divide). Falls back to the `gpu_`-prefixed metric names.
+  const hitsAll =
+    metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series;
+  const queriesAll =
+    metrics['vllm:prefix_cache_queries']?.series ??
+    metrics['vllm:gpu_prefix_cache_queries']?.series;
+  const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
+  const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
   const prefixCacheHitRate: number[] = [];
-  const hitsSeries =
-    firstSeries('vllm:prefix_cache_hits') ?? firstSeries('vllm:gpu_prefix_cache_hits');
-  const queriesSeries =
-    firstSeries('vllm:prefix_cache_queries') ?? firstSeries('vllm:gpu_prefix_cache_queries');
-  if (hitsSeries && queriesSeries) {
-    const qByStart = new Map<number, TimeSlice>();
-    for (const q of queriesSeries.timeslices ?? []) {
-      if (typeof q.start_ns === 'number') qByStart.set(q.start_ns, q);
-    }
-    for (const h of hitsSeries.timeslices ?? []) {
-      if (typeof h.start_ns !== 'number' || typeof h.rate !== 'number') continue;
-      const q = qByStart.get(h.start_ns);
-      if (!q || typeof q.rate !== 'number' || q.rate === 0) continue;
-      prefixCacheHitRate.push(h.rate / q.rate);
-    }
+  for (const [t, h] of hitsByT) {
+    const q = qByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push(h / q);
   }
 
   return { kvCacheUtil, prefixCacheHitRate };

From b3e315ccd66bfc5476fc7bf28b1b3c52628ffd8d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 26 May 2026 18:28:33 -0500
Subject: [PATCH 42/55] fix(scenario-selector): wrap "Deprecated" in
 SelectLabel + lead with agentic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two visual issues in the Scenario dropdown:
1. The "Deprecated" sub-header rendered as a bare span while sibling
   group labels ("Fixed Sequence Length") use SelectLabel — so
   "Deprecated" came out in body-text size, looking out of place.
2. Agentic Traces sat below the deprecated fixed-seq entries, visually
   implying it was part of the deprecated section.

Wraps DeprecatedSectionTitle in SelectLabel so the styling matches its
peers across all selectors (Scenario, Model, Hardware) that use it.
Moves the Agentic group to the top of the Scenario dropdown so it's
visually distinct from the fixed-seq + deprecated entries.

Agentic Traces was already the preferred default when available
(GlobalFilterContext.tsx); no behavior change there.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/ui/chart-selectors.tsx | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 19b4bfb0..8b91059a 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -33,7 +33,7 @@ import {
 
 function DeprecatedSectionTitle({ reason }: { reason: string }) {
   return (
-    <span className="flex items-center gap-1">
+    <SelectLabel className="flex items-center gap-1">
       Deprecated
       <TooltipRoot>
         <TooltipTrigger asChild>
@@ -43,7 +43,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) {
           <span>{reason}</span>
         </TooltipContent>
       </TooltipRoot>
-    </span>
+    </SelectLabel>
   );
 }
 
@@ -261,6 +261,17 @@ export function ScenarioSelector({
           <SelectValue />
         </SelectTrigger>
         <SelectContent>
+          {/* Agentic first — preferred default scenario when available. */}
+          {agentic.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Agentic</SelectLabel>
+              {agentic.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+            </SelectGroup>
+          )}
           {fixedSeq.length > 0 && (
             <SelectGroup>
               <SelectLabel>Fixed Sequence Length</SelectLabel>
@@ -281,11 +292,6 @@ export function ScenarioSelector({
               )}
             </SelectGroup>
           )}
-          {agentic.map((seq) => (
-            <SelectItem key={seq} value={seq}>
-              {getSequenceLabel(seq as Sequence)}
-            </SelectItem>
-          ))}
         </SelectContent>
       </Select>
     </div>

From 19b99586353cd39bccd4072bd6e2a2afcaf73367 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 26 May 2026 18:32:26 -0500
Subject: [PATCH 43/55] fix(scenario-selector): wrap Deprecated header in
 SelectLabel only inside Select
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous commit (b3e315c) changed DeprecatedSectionTitle to render
SelectLabel internally, which throws at runtime ("SelectLabel must be
used within SelectGroup") in callsites that render the header via
MultiSelect — MultiSelect wraps the header in its own div, not a Radix
SelectGroup.

Revert the component to a plain styled span (MultiSelect's div wrapper
supplies the small/muted styling), and wrap with SelectLabel only at
the ScenarioSelector callsite, where the header sits directly inside
a SelectGroup.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/ui/chart-selectors.tsx     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 8b91059a..49ea3f1a 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -31,9 +31,16 @@ import {
   sequenceKind,
 } from '@/lib/data-mappings';
 
+/**
+ * "Deprecated" sub-header used by selectors. Rendered as a span (not
+ * SelectLabel) because some callsites use `MultiSelect`, which wraps
+ * headers in its own div and isn't a SelectGroup. The span carries only
+ * layout classes (flex/gap) — the parent context supplies the muted/small
+ * text treatment. ScenarioSelector renders this inside a SelectLabel directly.
+ */
 function DeprecatedSectionTitle({ reason }: { reason: string }) {
   return (
-    <SelectLabel className="flex items-center gap-1">
+    <span className="flex items-center gap-1">
       Deprecated
       <TooltipRoot>
         <TooltipTrigger asChild>
@@ -43,7 +50,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) {
           <span>{reason}</span>
         </TooltipContent>
       </TooltipRoot>
-    </SelectLabel>
+    </span>
   );
 }
 
@@ -282,7 +289,9 @@ export function ScenarioSelector({
               ))}
               {fixedGroups.deprecated.length > 0 && (
                 <>
-                  <DeprecatedSectionTitle reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  <SelectLabel>
+                    <DeprecatedSectionTitle reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  </SelectLabel>
                   {fixedGroups.deprecated.map((seq) => (
                     <SelectItem key={seq} value={seq}>
                       {getSequenceLabel(seq as Sequence)}

From 7114833409b92a206f7c22b80846db527e01da43 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 13:22:13 -0500
Subject: [PATCH 44/55] feat(agentic-detail): add cumulative input tokens chart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surfaces a new chart on the agentic detail page showing the running
total of input (prompt) tokens served over the course of the run —
useful for seeing how the load actually accumulates vs the
instantaneous prefill_tps line we already plot.

Adds a `cumulativeSum` helper alongside the existing `cumulativeAverage`
and `sumSeries` time-series utilities. No backfill needed — the source
data (`chart_series.prefillTps`) is already pre-computed at ingest time
for every blob-bearing row.

(Input throughput as a Pareto axis is already wired via the existing
`y_inputTputPerGpu` y-axis option; no change there.)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 24 +++++++++++++++++++
 .../agentic-point/time-series-chart.tsx       | 17 +++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 2e43b4fb..1a61b93b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -26,6 +26,7 @@ import {
   StackedAreaChart,
   TimeSeriesChart,
   cumulativeAverage,
+  cumulativeSum,
   rollingAverage,
   sumSeries,
 } from './time-series-chart';
@@ -381,6 +382,29 @@ export function AgenticPointDetail({ id }: Props) {
               );
             }}
           />
+
+          <ExpandableChart
+            title="Total input tokens over time"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Cumulative input tokens',
+                      data: cumulativeSum(metrics.prefillTps),
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens"
+                  {...size}
+                />
+              );
+            }}
+          />
         </div>
       )}
     </div>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index cd10aff7..042c4331 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -58,6 +58,23 @@ export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Running cumulative sum of a per-interval rate series. Values are added
+ * as-is (not weighted by the gap between `t`s), so each output point equals
+ * the integral of the rate only if every sample covers a 1-second window
+ * (aiperf's scrape interval). Use for "total tokens served so far" from tokens/s.
+ */
+export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  if (data.length === 0) return data;
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  let sum = 0;
+  for (let i = 0; i < data.length; i++) {
+    sum += data[i]!.value;
+    out[i] = { t: data[i]!.t, value: sum };
+  }
+  return out;
+}
+
 /** Pointwise sum of two arrays sharing the same t index. */
 export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
   const n = Math.min(a.length, b.length);

From c6697de8ff3d8263924986fd71b4622f1369f9a3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 14:44:19 -0500
Subject: [PATCH 45/55] feat(agentic-detail): plot cumulative unique input
 tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the "Total input tokens over time" chart with "Total unique
input tokens over time" — cumsum of (prompt-token rate − prefix-cache-
hit rate per second), which equals the cumulative tokens vllm actually
had to prefill from scratch (= vllm:request_prefill_kv_computed_tokens).

Adds `prefixCacheHitsTps` to the chart_series JSONB (extracted by
summing vllm:prefix_cache_hits.rate across all engine series, same DP-
aware path as prefillTps). Bumps CHART_SERIES_VERSION to 3; the
existing trace-server-metrics query defaults the field to [] for any
older v2 rows so reads stay safe before backfill catches up.

Backfilled 62 rows to v3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx       | 14 +++++++++++---
 .../src/hooks/api/use-trace-server-metrics.ts    |  2 ++
 packages/db/src/etl/compute-chart-series.ts      | 16 +++++++++++++++-
 packages/db/src/queries/trace-server-metrics.ts  |  4 ++++
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 1a61b93b..4bebd37c 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -384,16 +384,24 @@ export function AgenticPointDetail({ id }: Props) {
           />
 
           <ExpandableChart
-            title="Total input tokens over time"
+            title="Total unique input tokens over time"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
+              // Unique = total prompt tokens vllm received minus the tokens
+              // it served from the prefix cache. The cache-miss portion is
+              // what actually constitutes "new content" the GPU had to
+              // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens.
+              const unique = sumSeries(
+                metrics.prefillTps,
+                metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })),
+              );
               return (
                 <TimeSeriesChart
                   series={[
                     {
-                      name: 'Cumulative input tokens',
-                      data: cumulativeSum(metrics.prefillTps),
+                      name: 'Cumulative unique input tokens',
+                      data: cumulativeSum(unique),
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 8418aa4f..664bc6c7 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -42,6 +42,8 @@ export interface TraceServerMetrics {
   promptTokensBySource: Record<string, TimeSeriesPoint[]>;
   prefillTps: TimeSeriesPoint[];
   decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 530600cf..91e89521 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -25,8 +25,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * only series[0], which under-counted by Nx on multi-engine DP/PP
  * deployments — most visible as a request-queue-depth chart that maxed out
  * at ~3 when the timeline clearly showed 20+ in-flight).
+ *
+ * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
+ * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
  */
-export const CHART_SERIES_VERSION = 2;
+export const CHART_SERIES_VERSION = 3;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -57,6 +60,13 @@ export interface ChartSeries {
   promptTokensBySource: Record<string, TimeSeriesPoint[]>;
   prefillTps: TimeSeriesPoint[];
   decodeTps: TimeSeriesPoint[];
+  /**
+   * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across
+   * engines. Detail page derives "cumulative unique input tokens" as
+   * cumsum(prefillTps - prefixCacheHitsTps) — the prompt tokens that
+   * missed the prefix cache and had to be prefilled from scratch.
+   */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -249,6 +259,9 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }));
   const prefillTps = counterRate('vllm:prompt_tokens');
   const decodeTps = counterRate('vllm:generation_tokens');
+  // Tokens served from prefix cache per scrape. Lets the frontend derive
+  // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits');
 
   // Per-source prompt tokens — sum across engines per source label.
   const promptBySrcByT = new Map<string, Map<number, number>>();
@@ -286,5 +299,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     promptTokensBySource,
     prefillTps,
     decodeTps,
+    prefixCacheHitsTps,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 624b6ed3..76775e77 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -71,6 +71,8 @@ export interface TraceServerMetrics {
   prefillTps: TimeSeriesPoint[];
   /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
   decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -114,6 +116,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     promptTokensBySource: series.promptTokensBySource,
     prefillTps: series.prefillTps,
     decodeTps: series.decodeTps,
+    // v2 chart_series rows pre-backfill don't have this field — default to []
+    prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
   };
 }
 

From b5679bb10acfd6a6765b48a5864b2a0ec73d4915 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:00:12 -0500
Subject: [PATCH 46/55] feat(request-timeline): expandable subagent -> stream
 rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The harness fans a single subagent into multiple parallel ":sN" streams
when its inner requests overlap in time (weka_trace._pack_into_streams).
Previously each :sN got its own swimlane row, which made one parent
conversation with 5 subagents (each fanned into 2-8 streams) render as
23 separate rows — visually implying 23 distinct subagent invocations
when really there are 5.

Now: each subagent shows as one row by default with a chevron + stream
count chip ("subagent 003 · f1e7 ×8"). The collapsed row draws the
union of all stream bars overlaid, so the concurrency burst is still
visible at a glance. Click the chevron to fan into per-stream rows;
click again to collapse.

For conv 0f5b266f in benchmark 206360: 23 rows → 5 rows by default.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 325 ++++++++++++------
 1 file changed, 226 insertions(+), 99 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index bcbe105a..8762a158 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -53,44 +53,84 @@ const PHASE_COLORS: Record<string, string> = {
   unknown: '#64748b',
 };
 
+/**
+ * Row kinds:
+ *   parent           — top-level conversation (depth 0)
+ *   worker           — worker swimlane (depth 0, worker mode)
+ *   subagent         — a subagent invocation (depth 1). Either a single
+ *                      stream (renders its own bars), or a multi-stream
+ *                      container whose bars are the union of its streams
+ *                      when collapsed.
+ *   stream           — one :sN stream of a multi-stream subagent (depth 2).
+ *                      Hidden by default; toggled in via the parent's chevron.
+ */
+type RowKind = 'parent' | 'worker' | 'subagent' | 'stream';
+
 interface Row {
   key: string;
   label: string;
   color: string;
   requests: RequestRecord[];
-  /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */
   depth: number;
-  /** True if this row is a sub-agent ("Subagent N of parent X"). */
-  isSubagent: boolean;
+  kind: RowKind;
+  /** Number of streams under this subagent (>=1). Only set for subagent rows. */
+  streamCount?: number;
+  /** For stream rows: the parent subagent's row key (drives expand/collapse). */
+  parentRowKey?: string;
 }
 
 /**
  * Conversation ids for subagent calls look like
- *   <parent_cid>::sa:subagent_<N>_<hash>
- * Split into the parent cid and a sub-agent label (or the whole thing if
- * this is a top-level conversation).
+ *   <parent_cid>::sa:<agent_id>[:s<stream_idx>]
+ * The optional `:s<N>` suffix is set when the harness fans a single
+ * subagent into multiple parallel "streams" (interval-graph
+ * decomposition in weka_trace._pack_into_streams). We split it off so
+ * we can group all streams of one subagent under a single header row.
  */
-function splitCid(cid: string): { parent: string; subagent: string | null } {
+function splitCid(cid: string): {
+  parent: string;
+  subagentBase: string | null;
+  stream: number | null;
+} {
   const sep = cid.indexOf('::sa:');
-  if (sep === -1) return { parent: cid, subagent: null };
-  return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) };
+  if (sep === -1) return { parent: cid, subagentBase: null, stream: null };
+  const parent = cid.slice(0, sep);
+  const raw = cid.slice(sep + 5);
+  const m = /^(.*):s(\d+)$/.exec(raw);
+  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) };
+  return { parent, subagentBase: raw, stream: null };
 }
 
-/** Group requests into rows; in conversation mode subagents nest under parents. */
-function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
-  const groups = new Map<string, RequestRecord[]>();
-  for (const r of requests) {
-    const key = mode === 'conversation' ? r.cid : r.wid;
-    let list = groups.get(key);
-    if (!list) {
-      list = [];
-      groups.set(key, list);
-    }
-    list.push(r);
-  }
-
+/**
+ * Group requests into rows. In conversation mode, output order is:
+ *   parent_conv
+ *     subagent_001                  (collapsed by default, container)
+ *       :s0                         (hidden unless expanded)
+ *       :s1
+ *     subagent_002
+ *     ...
+ *
+ * `expandedSubagents` controls which subagent containers reveal their
+ * stream children. Bars on a collapsed subagent are the UNION of all its
+ * streams' requests — overlapping bars visually communicate the
+ * stream-level parallelism without expanding.
+ */
+function buildRows(
+  requests: RequestRecord[],
+  mode: RowMode,
+  expandedSubagents: ReadonlySet<string>,
+): Row[] {
   if (mode !== 'conversation') {
     // Worker mode: flat rows, sorted by first activity.
+    const groups = new Map<string, RequestRecord[]>();
+    for (const r of requests) {
+      let list = groups.get(r.wid);
+      if (!list) {
+        list = [];
+        groups.set(r.wid, list);
+      }
+      list.push(r);
+    }
     const rows: Row[] = [];
     let i = 0;
     for (const [key, list] of groups) {
@@ -101,7 +141,7 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
         color: ROW_COLORS[i % ROW_COLORS.length]!,
         requests: list,
         depth: 0,
-        isSubagent: false,
+        kind: 'worker',
       });
       i++;
     }
@@ -109,36 +149,40 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
     return rows;
   }
 
-  // Conversation mode: build a parent → [subagents] tree so each parent
-  // group renders as one parent row followed by its sub-agent rows. Color
-  // is shared inside a tree so the visual grouping reads.
+  // Conversation mode — tree: parent → subagent → stream.
   interface Tree {
     parentCid: string;
-    parentRow: { key: string; requests: RequestRecord[] } | null;
-    subagents: Map<string, RequestRecord[]>; // subagent label → requests
+    parentReqs: RequestRecord[];
+    // subagentBase → (streamIndex|null → requests)
+    subagents: Map<string, Map<number | null, RequestRecord[]>>;
     firstStart: number;
   }
   const trees = new Map<string, Tree>();
-  for (const [cid, list] of groups) {
-    list.sort((a, b) => a.start - b.start);
-    const { parent, subagent } = splitCid(cid);
+  for (const r of requests) {
+    const { parent, subagentBase, stream } = splitCid(r.cid);
     let tree = trees.get(parent);
     if (!tree) {
       tree = {
         parentCid: parent,
-        parentRow: null,
+        parentReqs: [],
         subagents: new Map(),
         firstStart: Number.POSITIVE_INFINITY,
       };
       trees.set(parent, tree);
     }
-    if (subagent === null) {
-      tree.parentRow = { key: cid, requests: list };
+    if (subagentBase === null) {
+      tree.parentReqs.push(r);
     } else {
-      tree.subagents.set(subagent, list);
+      let saMap = tree.subagents.get(subagentBase);
+      if (!saMap) {
+        saMap = new Map();
+        tree.subagents.set(subagentBase, saMap);
+      }
+      const list = saMap.get(stream);
+      if (list) list.push(r);
+      else saMap.set(stream, [r]);
     }
-    const earliest = list[0]!.start;
-    if (earliest < tree.firstStart) tree.firstStart = earliest;
+    if (r.start < tree.firstStart) tree.firstStart = r.start;
   }
 
   const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
@@ -147,39 +191,66 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
   for (const tree of sortedTrees) {
     const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
     colorIdx++;
-    if (tree.parentRow) {
+    // Parent row (use a placeholder key if the parent itself wasn't replayed).
+    tree.parentReqs.sort((a, b) => a.start - b.start);
+    rows.push({
+      key: tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`,
+      label: tree.parentCid,
+      color,
+      requests: tree.parentReqs,
+      depth: 0,
+      kind: 'parent',
+    });
+
+    // One subagent row per base (which may contain N streams).
+    const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => {
+      const aStart = Math.min(
+        ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      const bStart = Math.min(
+        ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      return aStart - bStart;
+    });
+    for (const [saBase, streams] of subagentEntries) {
+      const subagentKey = `${tree.parentCid}::sa:${saBase}`;
+      // Union of all stream requests for collapsed-view bars.
+      const allReqs: RequestRecord[] = [];
+      for (const reqs of streams.values()) allReqs.push(...reqs);
+      allReqs.sort((a, b) => a.start - b.start);
+      const streamCount = streams.size;
       rows.push({
-        key: tree.parentRow.key,
-        label: shortenCid(tree.parentCid),
+        key: subagentKey,
+        label: `↳ ${formatSubagentLabel(saBase)}`,
         color,
-        requests: tree.parentRow.requests,
-        depth: 0,
-        isSubagent: false,
-      });
-    } else {
-      // Pseudo-parent header so orphan subagents still render under
-      // something they belong to.
-      rows.push({
-        key: `__parent_${tree.parentCid}`,
-        label: shortenCid(tree.parentCid),
-        color,
-        requests: [],
-        depth: 0,
-        isSubagent: false,
-      });
-    }
-    const subagentEntries = [...tree.subagents.entries()].toSorted(
-      (a, b) => a[1][0]!.start - b[1][0]!.start,
-    );
-    for (const [saLabel, list] of subagentEntries) {
-      rows.push({
-        key: `${tree.parentCid}::${saLabel}`,
-        label: `↳ ${formatSubagentLabel(saLabel)}`,
-        color,
-        requests: list,
+        requests: allReqs,
         depth: 1,
-        isSubagent: true,
+        kind: 'subagent',
+        streamCount,
       });
+
+      // Stream children only when expanded AND there's more than one
+      // stream (a single-stream subagent has nothing extra to show).
+      if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
+        const streamEntries = [...streams.entries()].toSorted((a, b) => {
+          // Sort by stream index (null first as the "default" stream)
+          const ai = a[0] ?? -1;
+          const bi = b[0] ?? -1;
+          return ai - bi;
+        });
+        for (const [streamIdx, reqs] of streamEntries) {
+          reqs.sort((a, b) => a.start - b.start);
+          rows.push({
+            key: `${subagentKey}:s${streamIdx ?? '∅'}`,
+            label: `stream ${streamIdx ?? '∅'}`,
+            color,
+            requests: reqs,
+            depth: 2,
+            kind: 'stream',
+            parentRowKey: subagentKey,
+          });
+        }
+      }
     }
   }
   return rows;
@@ -192,11 +263,6 @@ function formatSubagentLabel(raw: string): string {
   return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`;
 }
 
-function shortenCid(cid: string): string {
-  if (cid.length <= 12) return cid;
-  return `${cid.slice(0, 8)}…${cid.slice(-4)}`;
-}
-
 function shortenWid(wid: string): string {
   // worker_4ae87bea → w_4ae8
   return wid.replace(/^worker_/, 'w_').slice(0, 12);
@@ -314,6 +380,17 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   const [rowMode, setRowMode] = useState<RowMode>('conversation');
   const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+  // Which multi-stream subagents currently have their per-stream rows
+  // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
+  const [expandedSubagents, setExpandedSubagents] = useState<ReadonlySet<string>>(() => new Set());
+  const toggleSubagent = useCallback((key: string) => {
+    setExpandedSubagents((prev) => {
+      const next = new Set(prev);
+      if (next.has(key)) next.delete(key);
+      else next.add(key);
+      return next;
+    });
+  }, []);
   const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
 
   // Apply phase filter, then group into rows.
@@ -322,7 +399,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
       phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
     [data.requests, phaseFilter],
   );
-  const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]);
+  const rows = useMemo(
+    () => buildRows(filtered, rowMode, expandedSubagents),
+    [filtered, rowMode, expandedSubagents],
+  );
 
   // Pre-sort the timestamp columns so the cursor-time stats popover can
   // count "running / waiting at time t" in O(log n). With a few hundred
@@ -359,7 +439,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   const isZoomed = viewEnd !== null;
 
   // Layout
-  const LABEL_WIDTH = 160;
+  // Wide enough for a full 36-char conversation id at 10px font, plus the
+  // indent + color stripe + count badge. Subagent rows inherit the same
+  // width but truncate the longer "↳ subagent N · hash" tail with ellipsis.
+  const LABEL_WIDTH = 360;
   const ROW_HEIGHT = 22;
   const ROW_GAP = 3;
   const HEADER_HEIGHT = 24;
@@ -537,33 +620,58 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                 {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
               </span>
             </div>
-            {rows.map((row) => (
-              <div
-                key={row.key}
-                className="flex items-center gap-1.5 overflow-hidden pr-2"
-                style={{
-                  height: ROW_HEIGHT + ROW_GAP,
-                  paddingLeft: 8 + row.depth * 12,
-                }}
-              >
-                <span
-                  className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+            {rows.map((row) => {
+              const isSubagentRow = row.kind === 'subagent';
+              const isStreamRow = row.kind === 'stream';
+              const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+              const isExpanded = isExpandable && expandedSubagents.has(row.key);
+              return (
+                <div
+                  key={row.key}
+                  className="flex items-center gap-1 overflow-hidden pr-2"
                   style={{
-                    backgroundColor: row.color,
-                    opacity: row.isSubagent ? 0.55 : 1,
+                    height: ROW_HEIGHT + ROW_GAP,
+                    paddingLeft: 4 + row.depth * 10,
                   }}
-                />
-                <span
-                  className="text-[10px] font-mono truncate"
-                  style={{ color: row.color, opacity: row.isSubagent ? 0.85 : 1 }}
                 >
-                  {row.label}
-                </span>
-                <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
-                  {row.requests.length > 0 ? row.requests.length : '—'}
-                </span>
-              </div>
-            ))}
+                  {isExpandable ? (
+                    <button
+                      type="button"
+                      onClick={() => toggleSubagent(row.key)}
+                      className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+                      aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                      title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                    >
+                      <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
+                    </button>
+                  ) : (
+                    <span className="size-3.5 shrink-0" />
+                  )}
+                  <span
+                    className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                    style={{
+                      backgroundColor: row.color,
+                      opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                    }}
+                  />
+                  <span
+                    className="text-[10px] font-mono truncate"
+                    style={{
+                      color: row.color,
+                      opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                    }}
+                  >
+                    {row.label}
+                    {isExpandable && (
+                      <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                    )}
+                  </span>
+                  <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                    {row.requests.length > 0 ? row.requests.length : '—'}
+                  </span>
+                </div>
+              );
+            })}
           </div>
 
           {/* Scrollable SVG */}
@@ -636,6 +744,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
               {rows.map((row, rowIdx) => {
                 const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
                 const barH = ROW_HEIGHT - 4;
+                // For multi-stream subagent containers, suppress the union
+                // bars when expanded — the child stream rows draw them
+                // individually instead, so we'd double-draw otherwise.
+                if (
+                  row.kind === 'subagent' &&
+                  (row.streamCount ?? 1) > 1 &&
+                  expandedSubagents.has(row.key)
+                ) {
+                  return null;
+                }
                 return row.requests.map((req) => {
                   const xCredit = xOf(req.credit);
                   const xStart = xOf(req.start);
@@ -663,7 +781,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                           opacity={0.35}
                         />
                       )}
-                      {/* Main bar */}
+                      {/* Main bar — opacity stepped down with depth so
+                          parent > subagent > stream reads visually. */}
                       <rect
                         x={xStart}
                         y={yTop}
@@ -671,7 +790,15 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                         height={barH}
                         rx={2}
                         fill={row.color}
-                        opacity={req.cancelled ? 0.35 : row.isSubagent ? 0.6 : 0.85}
+                        opacity={
+                          req.cancelled
+                            ? 0.35
+                            : row.kind === 'stream'
+                              ? 0.5
+                              : row.kind === 'subagent'
+                                ? 0.6
+                                : 0.85
+                        }
                       />
                       {/* Phase strip at bottom */}
                       <rect

From 2e1f1ce33da85dbc8058bf41feffffc04ba7ee26 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:07:27 -0500
Subject: [PATCH 47/55] fix(agentic-detail): make unique-input-tokens chart
 monotonic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vllm's per-scrape prompt_tokens.rate and prefix_cache_hits.rate counters
can lag each other by several seconds across scrapes (we see prefill=0
at one tick with hits=1.1M, then prefill=1.5M with hits=452K six ticks
later — lifetime totals agree but per-tick they don't). Computing
cumsum(prefill - hits) per tick made the chart dip well negative at
the start.

Replaces the per-tick subtraction with `cumulativeDifferenceMonotonic`:
union the two series by timestamp, accumulate each independently, take
the diff, then enforce a running max so the curve never decreases.
End-of-run totals are unchanged (both counters converge to the right
value); transient skew just looks like a brief plateau instead of a
negative dip.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 21 ++++++-----
 .../agentic-point/time-series-chart.tsx       | 37 +++++++++++++++++++
 2 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 4bebd37c..1abf64e6 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -26,7 +26,7 @@ import {
   StackedAreaChart,
   TimeSeriesChart,
   cumulativeAverage,
-  cumulativeSum,
+  cumulativeDifferenceMonotonic,
   rollingAverage,
   sumSeries,
 } from './time-series-chart';
@@ -388,20 +388,21 @@ export function AgenticPointDetail({ id }: Props) {
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
-              // Unique = total prompt tokens vllm received minus the tokens
-              // it served from the prefix cache. The cache-miss portion is
-              // what actually constitutes "new content" the GPU had to
-              // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens.
-              const unique = sumSeries(
-                metrics.prefillTps,
-                metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })),
-              );
+              // Unique = total prompt tokens received minus tokens served
+              // from the prefix cache. Equivalent to cumsum of
+              // vllm:request_prefill_kv_computed_tokens. We compute it as
+              // monotonic-non-decreasing cumulative-diff so per-scrape
+              // timing skew between the prompt_tokens and prefix_cache_hits
+              // counters can't make the line dip negative.
               return (
                 <TimeSeriesChart
                   series={[
                     {
                       name: 'Cumulative unique input tokens',
-                      data: cumulativeSum(unique),
+                      data: cumulativeDifferenceMonotonic(
+                        metrics.prefillTps,
+                        metrics.prefixCacheHitsTps,
+                      ),
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 042c4331..25d5a672 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -75,6 +75,43 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Monotonic-non-decreasing cumulative difference of two rate series:
+ * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce
+ * a running max so the curve never dips below its prior value.
+ *
+ * Use this to plot things like "cumulative cache-missed tokens" where the
+ * true value can only ever grow, but the underlying per-tick rates can
+ * temporarily look negative due to counter timing skew between scrapes
+ * (vllm's `prefix_cache_hits` and `prompt_tokens` counters can lag each
+ * other by ~5-10 s in our data even though their lifetime totals agree).
+ *
+ * `a` and `b` may have different (or overlapping) timestamp sets — both
+ * are unioned and walked in time order. Output has one point per unique
+ * timestamp present in either input.
+ */
+export function cumulativeDifferenceMonotonic(
+  a: TimeSeriesPoint[],
+  b: TimeSeriesPoint[],
+): TimeSeriesPoint[] {
+  const aByT = new Map(a.map((p) => [p.t, p.value]));
+  const bByT = new Map(b.map((p) => [p.t, p.value]));
+  const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y);
+  const out: TimeSeriesPoint[] = Array.from({ length: allT.length });
+  let cumA = 0;
+  let cumB = 0;
+  let runningMax = 0;
+  for (let i = 0; i < allT.length; i++) {
+    const t = allT[i]!;
+    cumA += aByT.get(t) ?? 0;
+    cumB += bByT.get(t) ?? 0;
+    const diff = cumA - cumB;
+    if (diff > runningMax) runningMax = diff;
+    out[i] = { t, value: runningMax };
+  }
+  return out;
+}
+
 /** Pointwise sum of two arrays sharing the same t index. */
 export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
   const n = Math.min(a.length, b.length);

From 08bbe6650c73935d7ac7a9fa29a722b141911bc9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:15:05 -0500
Subject: [PATCH 48/55] feat(agentic-detail): add unique input tokens in flight
 chart

New chart on the per-point view that plots the deduped count of
input tokens currently held by in-flight requests, as a 30s time-
weighted rolling average with the raw step series rendered as faint
scatter behind it. Useful for seeing the working set the model has
to hold KV cache for at any instant.

Computation (frontend, from request_timeline):
  - At each request start/end event, maintain active ISL per cid
    (within one cid turns are sequential, so each cid contributes
    at most one in-flight ISL at a time)
  - total_in_flight(t) = sum over cids with active request of that
    cid's current ISL
  - Across cids we treat content as independent (cross-conv prefix
    sharing measured at <1 pp, so summing is a tight approximation)

Adds timeRollingAverage helper: time-weighted (vs sample-count)
moving average suitable for irregularly-sampled event series like
this one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 43 ++++++++-
 .../agentic-point/time-series-chart.tsx       | 96 +++++++++++++++++++
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 1abf64e6..2db2809b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -27,8 +27,10 @@ import {
   TimeSeriesChart,
   cumulativeAverage,
   cumulativeDifferenceMonotonic,
+  inflightUniqueTokens,
   rollingAverage,
   sumSeries,
+  timeRollingAverage,
 } from './time-series-chart';
 
 interface Props {
@@ -124,8 +126,10 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
-  // Per-request timeline fetched only when the timeline view is active.
-  const timelineQuery = useRequestTimeline(id, view === 'timeline');
+  // Per-request timeline used by both the timeline view AND the per-point
+  // "Unique input tokens in flight" chart, so fetch whenever we're on
+  // either view.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -414,6 +418,41 @@ export function AgenticPointDetail({ id }: Props) {
               );
             }}
           />
+
+          <ExpandableChart
+            title="Unique input tokens in flight"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!timelineQuery.data) {
+                return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+              }
+              // Step function: at each request start/end, sum the ISLs of
+              // currently-active requests across distinct cids. Within one
+              // cid turns are sequential so each cid contributes at most
+              // one in-flight ISL; across cids we treat content as
+              // independent (cross-conv prefix sharing adds <1pp in
+              // practice). Smooth with a 30s time-weighted rolling average
+              // so brief turn-handoff dips don't dominate the chart.
+              const raw = inflightUniqueTokens(timelineQuery.data.requests);
+              const smoothed = timeRollingAverage(raw, 30);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'In flight (avg 30s)',
+                      data: smoothed,
+                      rawData: raw,
+                      color: '#a855f7',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={timelineQuery.data.durationS}
+                  yAxisLabel="Tokens"
+                  {...size}
+                />
+              );
+            }}
+          />
         </div>
       )}
     </div>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 25d5a672..520b3ed6 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -27,6 +27,39 @@ interface TimeSeriesChartProps {
   height?: number;
 }
 
+/**
+ * Time-weighted rolling average over a `windowS`-second trailing window.
+ * Treats the input as a step function (value held constant between
+ * samples), integrates it over the window, and divides by the window's
+ * actual span (shorter than `windowS` near t=0, where it is clamped).
+ * Expects `data` sorted by ascending `t`; suits irregular event series
+ * where sample-count `rollingAverage` would over-weight bursts.
+ */
+export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] {
+  if (data.length === 0 || windowS <= 0) return data;
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  // First sample at or after the window start. Windows only move forward
+  // on sorted input, so carry it across iterations (no rescan from 0).
+  let lo = 0;
+  for (let i = 0; i < data.length; i++) {
+    const tEnd = data[i]!.t;
+    const tStart = Math.max(0, tEnd - windowS);
+    while (lo < data.length && data[lo]!.t < tStart) lo++;
+    let prevT = tStart;
+    // Step value entering the window: last sample before it, else data[0].
+    let prevV = lo > 0 ? data[lo - 1]!.value : data[0]!.value;
+    let area = 0;
+    for (let j = lo; j <= i; j++) {
+      area += prevV * (data[j]!.t - prevT);
+      prevT = data[j]!.t;
+      prevV = data[j]!.value;
+    }
+    const dur = tEnd - tStart;
+    out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value };
+  }
+  return out;
+}
+
 /** Centered rolling average over `windowSize` samples. */
 export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
   if (data.length === 0 || windowSize <= 1) return data;
@@ -75,6 +108,69 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Per-event step series: at each request start/end, sum the ISLs of
+ * currently-active requests across distinct `cid`s. Within a single
+ * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N),
+ * so each cid contributes at most one in-flight ISL at a time. Across
+ * different cids we assume content is independent (parent ↔ subagent
+ * and conv ↔ conv share negligible prefix in practice — cross-conv
+ * dedup added ~0.25 pp to theoretical hit rate, so treating them as
+ * independent is a tight approximation of the true in-flight unique
+ * token count).
+ *
+ * Output is a step function: one point per event, value held constant
+ * until the next event, after a leading `{ t: 0, value: 0 }`. Time axis is
+ * seconds from the request_timeline origin (timestamps are not rebased).
+ */
+export function inflightUniqueTokens(
+  requests: readonly { cid: string; start: number; end: number; isl: number | null }[],
+): TimeSeriesPoint[] {
+  if (requests.length === 0) return [];
+  // The request_timeline timestamps are ns-relative to its own origin;
+  // they are converted to seconds when the step series is emitted below.
+  interface Event {
+    tNs: number;
+    kind: 'start' | 'end';
+    cid: string;
+    isl: number;
+  }
+  const events: Event[] = [];
+  for (const r of requests) {
+    const isl = r.isl ?? 0;
+    if (isl <= 0 || r.end <= r.start) continue; // zero/negative spans would never be released
+    events.push({ tNs: r.start, kind: 'start', cid: r.cid, isl });
+    events.push({ tNs: r.end, kind: 'end', cid: r.cid, isl });
+  }
+  if (events.length === 0) return [];
+  // Sort by time; on ties, process 'end' before 'start' so a same-instant
+  // turn handoff within one cid doesn't transiently double-count.
+  events.sort((a, b) => a.tNs - b.tNs || (a.kind === b.kind ? 0 : a.kind === 'end' ? -1 : 1));
+
+  // Active ISL per cid (max in case the same cid somehow has overlapping
+  // events; in practice it's always 0 or 1 request at a time per cid).
+  const activeByCid = new Map<string, number>();
+  let total = 0;
+  const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }];
+  for (const e of events) {
+    const tSec = e.tNs / 1e9;
+    if (e.kind === 'start') {
+      const prev = activeByCid.get(e.cid) ?? 0;
+      const next = Math.max(prev, e.isl);
+      activeByCid.set(e.cid, next);
+      total += next - prev;
+    } else {
+      const cur = activeByCid.get(e.cid) ?? 0;
+      if (cur > 0) {
+        total -= cur;
+        activeByCid.delete(e.cid);
+      }
+    }
+    out.push({ t: tSec, value: Math.max(0, total) });
+  }
+  return out;
+}
+
 /**
  * Monotonic-non-decreasing cumulative difference of two rate series:
  * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce

From 7561deb1cc5a210ce6cd074ab0d4771b3b9f8342 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 20:30:39 -0500
Subject: [PATCH 49/55] feat(chart-series): extract SGLang metrics alongside
 vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our chart_series + aggregate_stats extractors hardcoded vllm:* metric
names, so SGLang runs (e.g. qwen3.5/h100/sglang) ingested cleanly but
the per-point detail page rendered empty charts — chart_series fields
were all zero-length arrays.

Adds fallback chains in each extractor:

  KV cache util      vllm:kv_cache_usage_perc  → sglang:token_usage
  Prefix cache hits  vllm:prefix_cache_hits    → sglang:cached_tokens
  Prefix cache qrys  vllm:prefix_cache_queries → vllm:prompt_tokens → sglang:prompt_tokens
  Requests running   vllm:num_requests_running → sglang:num_running_reqs
  Requests waiting   vllm:num_requests_waiting → sglang:num_queue_reqs
  Prompt tokens rate vllm:prompt_tokens        → sglang:prompt_tokens
  Generation rate    vllm:generation_tokens    → sglang:generation_tokens

The `pickFirstNonEmpty` helper walks the chain and uses whichever
series has data, so a future framework (mori-sglang, dynamo, etc.) can
plug in by adding its names to each chain — no per-framework branching.

CHART_SERIES_VERSION → 4, STATS_VERSION → 3. Both backfills re-ran (86
chart_series rows, 190 aggregate_stats rows). SGLang chart_series for
qwen3.5 run 944 verified — was 0-length arrays before, now ~1800
samples each.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts   | 67 +++++++++++++++----
 packages/db/src/queries/agentic-aggregates.ts | 56 +++++++++++++---
 2 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 91e89521..86b79925 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -28,8 +28,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  *
  * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
  * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
+ *
+ * v4: extract sglang:* metrics too (fallback chain in each picker), so
+ * SGLang runs populate the chart_series the same way vllm runs do.
  */
-export const CHART_SERIES_VERSION = 3;
+export const CHART_SERIES_VERSION = 4;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -89,8 +92,13 @@ interface RawMetric {
 
 type MetricsMap = Record<string, RawMetric>;
 
-/** The set of metric subtrees the chart consumes. */
+/**
+ * The set of metric subtrees the chart consumes. Includes both vllm:* and
+ * sglang:* names so the stream-parse fallback collects whichever framework
+ * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric.
+ */
 const CHART_METRIC_KEYS = new Set([
+  // vLLM
   'vllm:kv_cache_usage_perc',
   'vllm:gpu_cache_usage_perc',
   'vllm:prefix_cache_hits',
@@ -100,6 +108,13 @@ const CHART_METRIC_KEYS = new Set([
   'vllm:prompt_tokens',
   'vllm:generation_tokens',
   'vllm:prompt_tokens_by_source',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
+  'sglang:generation_tokens',
+  'sglang:num_running_reqs',
+  'sglang:num_queue_reqs',
 ]);
 
 /**
@@ -220,18 +235,37 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   if (!Number.isFinite(startNs)) startNs = 0;
   const tOf = (ns: number) => (ns - startNs) / 1e9;
 
+  // Pick the first metric name whose series array has any data; fallback
+  // chain lets the same code path serve both vllm:* and sglang:* blobs.
+  const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => {
+    for (const name of names) {
+      const s = metrics[name]?.series;
+      if (s && s.length > 0) return s;
+    }
+    return undefined;
+  };
+
   // KV cache usage (gauge, 0..1) — average across engines so the value
   // stays a fraction (each engine has its own KV pool).
-  const kvSeries =
-    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvSeries = pickSeries(
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
   const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
     aggregateByStart(kvSeries, 'avg', 'avg'),
   ).map(([t, v]) => ({ t: tOf(t), value: v }));
 
   // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
-  // engines, joined on start_ns.
-  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
-  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
+  // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
+  const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+  const qsSeries = pickSeries(
+    'vllm:prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
+  const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum');
+  const qsByT = aggregateByStart(qsSeries, 'rate', 'sum');
   const prefixCacheHitRate: TimeSeriesPoint[] = [];
   for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) {
     const q = qsByT.get(t);
@@ -239,8 +273,10 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   }
 
   // Queue depth: sum running + waiting across engines per timeslice.
-  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
-  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
+  const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs');
+  const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs');
+  const runByT = aggregateByStart(runSeries, 'avg', 'sum');
+  const waitByT = aggregateByStart(waitSeries, 'avg', 'sum');
   const queueDepth: QueueDepthPoint[] = [];
   // Union of timestamps so we surface activity even if one of the gauges
   // didn't report a sample on a given tick.
@@ -252,16 +288,19 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   }
 
   // Throughput: sum the counter `rate` (already per-second) across engines.
-  const counterRate = (name: string): TimeSeriesPoint[] =>
-    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+  // Takes a fallback chain so vllm:* and sglang:* both work.
+  const counterRate = (...names: string[]): TimeSeriesPoint[] => {
+    const s = pickSeries(...names);
+    return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({
       t: tOf(t),
       value: v,
     }));
-  const prefillTps = counterRate('vllm:prompt_tokens');
-  const decodeTps = counterRate('vllm:generation_tokens');
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens');
   // Tokens served from prefix cache per scrape. Lets the frontend derive
   // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
-  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits');
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
   // Per-source prompt tokens — sum across engines per source label.
   const promptBySrcByT = new Map<string, Map<number, number>>();
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 1ad7fd7f..da5d18a0 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -32,8 +32,12 @@ import type { DbClient } from '../connection.js';
  *
  * v2: aggregate vllm gauges/counters across all engine series (was reading
  * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
+ *
+ * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
+ * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
+ * they do for vllm runs.
  */
-export const STATS_VERSION = 2;
+export const STATS_VERSION = 3;
 
 export interface MetricPercentiles {
   mean: number;
@@ -199,6 +203,18 @@ function aggregateSeriesByStart(
  * Aggregates across all engine series so multi-engine DP/PP deployments are
  * counted correctly (previously we only read engine 0).
  */
+/** Series of the first listed metric that has any data (vllm/sglang fallback chain). */
+function pickFirstNonEmpty(
+  metrics: Record<string, MetricMeta>,
+  ...names: string[]
+): Series[] | undefined {
+  const firstWithData = names.find((candidate) => {
+    const series = metrics[candidate]?.series;
+    return Boolean(series && series.length > 0);
+  });
+  return firstWithData === undefined ? undefined : metrics[firstWithData]?.series;
+}
+
 export function extractServerMetricSamples(json: string): {
   kvCacheUtil: number[];
   prefixCacheHitRate: number[];
@@ -208,17 +224,29 @@ export function extractServerMetricSamples(json: string): {
 
   // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
   // value stays a percentage; summing would give meaningless 0..N.
-  const kvSeriesAll =
-    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvSeriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
   const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
 
   // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across
-  // all engines. Sum first, then divide.
-  const hitsAll =
-    metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series;
-  const queriesAll =
-    metrics['vllm:prefix_cache_queries']?.series ??
-    metrics['vllm:gpu_prefix_cache_queries']?.series;
+  // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens.
+  const hitsAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_hits',
+    'vllm:gpu_prefix_cache_hits',
+    'sglang:cached_tokens',
+  );
+  const queriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_queries',
+    'vllm:gpu_prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
   const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
   const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
   const prefixCacheHitRate: number[] = [];
@@ -232,12 +260,18 @@ export function extractServerMetricSamples(json: string): {
 
 /** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
 const TARGET_METRIC_KEYS = new Set([
+  // vLLM
   'vllm:kv_cache_usage_perc',
-  'vllm:gpu_cache_usage_perc', // older fallback name
+  'vllm:gpu_cache_usage_perc',
   'vllm:prefix_cache_hits',
   'vllm:prefix_cache_queries',
-  'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+  'vllm:gpu_prefix_cache_hits',
   'vllm:gpu_prefix_cache_queries',
+  'vllm:prompt_tokens',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
 ]);
 
 /**

From 625d6e85e411cf8081977d3b76ad98d1805ad3c5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 20:48:58 -0500
Subject: [PATCH 50/55] fix(ingest): derive GPU cache hit rate for SGLang at
 ingest time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SGLang runs' harness JSON doesn't populate server_gpu_cache_hit_rate
(vLLM runs do), so the detail-page header and inference chart tooltip
showed "—" for SGLang points. Now at trace_replay ingest, if any of
the linked benchmark_results rows has a null server_gpu_cache_hit_rate
and we have non-empty prefill/hits time-series in the computed
chart_series, derive the lifetime cluster ratio as
sum(hits.rate) / sum(prompt.rate) and write it into the row's metrics
JSONB.

Already-stored SGLang rows from runs 944/945 backfilled via a one-off
UPDATE earlier in this session (8 rows, mostly ~87-89% hit rate, one
high-conc outlier at 2.4%).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8cc03f2a..8d1e01b8 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -100,4 +100,23 @@ export async function insertTraceReplay(
     set trace_replay_id = ${traceReplayId}
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
+
+  // Derive a lifetime GPU cache hit rate from chart_series for any linked
+  // row whose harness JSON didn't set one (SGLang runs don't populate
+  // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series lacks
+  // prefill OR cache-hit samples: a missing hit metric must stay null (as
+  // with no trace_replay) rather than be stored as a measured 0% hit rate.
+  if (chartSeries && chartSeries.prefillTps.length > 0) {
+    const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
+    const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+    if (sumPrompts > 0 && chartSeries.prefixCacheHitsTps.length > 0) {
+      const rate = sumHits / sumPrompts;
+      await sql`
+        update benchmark_results
+        set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric))
+        where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+          and (metrics->>'server_gpu_cache_hit_rate') is null
+      `;
+    }
+  }
 }

From aa76e9eca423d3ab2c7079ff28d74b70adefae1c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 14:38:52 -0500
Subject: [PATCH 51/55] feat(chart-series): map sglang:realtime_tokens to
 promptTokensBySource
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Cumulative prompt token source breakdown" chart was empty for
SGLang runs because the vllm-specific vllm:prompt_tokens_by_source
metric doesn't exist on SGLang. Maps sglang:realtime_tokens (which has
mode={prefill_cache, prefill_compute, decode}) into the same source
breakdown when no vllm series is present, filtered to prefill_* modes
(decode tokens are output throughput, not prompt-token volume).

CHART_SERIES_VERSION → 5. Backfilled 128 rows; SGLang rows from runs
944/946/947 now have prefill_cache + prefill_compute sources populated.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts | 31 ++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 86b79925..0807e238 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -31,8 +31,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  *
  * v4: extract sglang:* metrics too (fallback chain in each picker), so
  * SGLang runs populate the chart_series the same way vllm runs do.
+ *
+ * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
+ * into promptTokensBySource so the cumulative prompt-token-source-breakdown
+ * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
  */
-export const CHART_SERIES_VERSION = 4;
+export const CHART_SERIES_VERSION = 5;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -115,6 +119,7 @@ const CHART_METRIC_KEYS = new Set([
   'sglang:generation_tokens',
   'sglang:num_running_reqs',
   'sglang:num_queue_reqs',
+  'sglang:realtime_tokens',
 ]);
 
 /**
@@ -303,6 +308,12 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
   // Per-source prompt tokens — sum across engines per source label.
+  //   vllm: vllm:prompt_tokens_by_source has one series per source label
+  //         (local_cache_hit, external_cache_hit, miss, ...). Use the
+  //         `source`/`reason`/`kind` label as the breakdown key.
+  //   sglang: sglang:realtime_tokens uses a `mode` label with values
+  //         {prefill_cache, prefill_compute, decode}. Filter to prefill_*
+  //         since decode isn't prompt-token volume.
   const promptBySrcByT = new Map<string, Map<number, number>>();
   for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
     const labels = series.labels ?? {};
@@ -318,6 +329,24 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
       }
     }
   }
+  // SGLang fallback: only consider when the vllm metric wasn't found.
+  if (promptBySrcByT.size === 0) {
+    for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const mode = labels['mode'] ?? 'unknown';
+      if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens)
+      let byT = promptBySrcByT.get(mode);
+      if (!byT) {
+        byT = new Map<number, number>();
+        promptBySrcByT.set(mode, byT);
+      }
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+          byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+        }
+      }
+    }
+  }
   const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
   for (const [source, byT] of promptBySrcByT) {
     const arr: TimeSeriesPoint[] = [];

From 5872a3d8d3c6f5e6feee879e2f8f6f5d0ddd04ac Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 14:48:27 -0500
Subject: [PATCH 52/55] feat(chart-series): break out SGLang cache hits by
 cache_source
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously SGLang detail pages showed two stacked-area layers in the
prompt-token source breakdown: prefill_cache (everything that hit the
cache) + prefill_compute (cache miss). The user wanted finer
granularity — specifically a distinction between on-GPU HBM cache and
CPU-offloaded (hicache) host cache.

SGLang's sglang:cached_tokens metric carries a cache_source label that
varies per cache tier:
  - "device" → on-GPU HBM cache hit
  - "host"   → CPU-offload (hicache) cache hit
  - "total"  → older sglang, single series with no tier breakdown

Switches the cache-hit portion of the breakdown from the coarse
`prefill_cache` mode label to per-cache_source series:
  - device → "cache hit (HBM)"
  - host   → "cache hit (CPU offload)"
  - total  → "cache hit"
  - other  → "cache hit (<src>)"

Cache misses still come from realtime_tokens[mode=prefill_compute],
relabeled "compute (miss)" for symmetry.

Current data only contains device/total (no hicache runs ingested
yet) — when hicache runs come in, the chart will automatically split
cache hits into HBM + CPU-offload layers with no further code change.

CHART_SERIES_VERSION → 6. Backfilled 128 rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts | 47 +++++++++++++++++++--
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 0807e238..1996708f 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -35,8 +35,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
  * into promptTokensBySource so the cumulative prompt-token-source-breakdown
  * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
+ *
+ * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source
+ * breakdown from sglang:cached_tokens — current runs always have one
+ * cache_source ("device" / HBM) but hicache (CPU offload) runs would
+ * split into "device" + "host" automatically once ingested.
  */
-export const CHART_SERIES_VERSION = 5;
+export const CHART_SERIES_VERSION = 6;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -330,15 +335,49 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }
   }
   // SGLang fallback: only consider when the vllm metric wasn't found.
+  //   - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]`
+  //   - Cache hits, split by tier: per-series `sglang:cached_tokens` where each
+  //     series carries a `cache_source` label ("device" = HBM, "host" = CPU
+  //     offload via hicache). Current runs have only `device`; when hicache
+  //     runs land, additional series will appear and the chart will split.
   if (promptBySrcByT.size === 0) {
     for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
       const labels = series.labels ?? {};
       const mode = labels['mode'] ?? 'unknown';
-      if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens)
-      let byT = promptBySrcByT.get(mode);
+      // Only carry the cache-miss line over — cache hits come from
+      // sglang:cached_tokens broken out by cache_source below, so we'd
+      // double-count if we kept `prefill_cache` here too.
+      if (mode !== 'prefill_compute') continue;
+      const label = 'compute (miss)';
+      let byT = promptBySrcByT.get(label);
+      if (!byT) {
+        byT = new Map<number, number>();
+        promptBySrcByT.set(label, byT);
+      }
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+          byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+        }
+      }
+    }
+    // Cache hits broken out per cache_source. A series labelled "total"
+    // (older sglang: one un-broken-out series) or carrying no cache_source
+    // label at all is shown as plain "cache hit".
+    for (const series of metrics['sglang:cached_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const src = labels['cache_source'] ?? 'total';
+      const label =
+        src === 'device'
+          ? 'cache hit (HBM)'
+          : src === 'host'
+            ? 'cache hit (CPU offload)'
+            : src === 'total'
+              ? 'cache hit'
+              : `cache hit (${src})`;
+      let byT = promptBySrcByT.get(label);
       if (!byT) {
         byT = new Map<number, number>();
-        promptBySrcByT.set(mode, byT);
+        promptBySrcByT.set(label, byT);
       }
       for (const ts of series.timeslices ?? []) {
         if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {

From 94a3e8b1986e54165c062e2a14eda60d9e9dd146 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:01:24 -0500
Subject: [PATCH 53/55] feat(chart-series): host cache util line + fix SGLang
 stacked-area colors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related fixes for SGLang hicache rendering on the agentic detail page:

1. KV cache utilization chart was GPU-HBM-only. SGLang hicache runs also
   expose sglang:hicache_host_{used,total}_tokens — the CPU offload
   pool's tokens-in-use over its capacity. Extracted as a new
   `hostKvCacheUsage` time series; frontend overlays it as a second
   orange line on the existing chart when the row has hicache data.

2. The cumulative-prompt-token-source-breakdown chart rendered ALL
   three SGLang sources in the same color, because the colors dict
   only knew vllm-style names (local_compute, local_cache_hit, etc.).
   Added explicit colors for the SGLang label names ('cache hit
   (HBM)', 'cache hit (CPU offload)', 'cache hit', 'compute (miss)')
   plus a memoized fallback palette so any future unknown source name
   gets a distinct color rather than falling through to gray.

CHART_SERIES_VERSION → 7. Backfilled 128 rows; hicache rows from
workflow_run 947 (8 rows) now have ~1830 hostKvCacheUsage samples
matching their HBM samples.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 16 ++++++++-
 .../agentic-point/time-series-chart.tsx       | 30 ++++++++++++++--
 .../src/hooks/api/use-trace-server-metrics.ts |  2 ++
 packages/db/src/etl/compute-chart-series.ts   | 36 ++++++++++++++++++-
 .../db/src/queries/trace-server-metrics.ts    |  3 ++
 5 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 2db2809b..b047ea8f 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -236,16 +236,30 @@ export function AgenticPointDetail({ id }: Props) {
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
+              // For SGLang hicache rows we have both GPU (HBM) util and
+              // host (CPU offload pool) util — overlay them as two lines.
+              const hasHost = metrics.hostKvCacheUsage.length > 0;
               return (
                 <TimeSeriesChart
                   series={[
                     {
-                      name: 'GPU KV cache (avg n=50)',
+                      name: hasHost ? 'GPU HBM (avg n=50)' : 'GPU KV cache (avg n=50)',
                       data: rollingAverage(metrics.kvCacheUsage, 50),
                       rawData: metrics.kvCacheUsage,
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
+                    ...(hasHost
+                      ? [
+                          {
+                            name: 'CPU offload pool (avg n=50)',
+                            data: rollingAverage(metrics.hostKvCacheUsage, 50),
+                            rawData: metrics.hostKvCacheUsage,
+                            color: '#f97316',
+                            strokeWidth: 2,
+                          },
+                        ]
+                      : []),
                   ]}
                   durationS={metrics.durationS}
                   yMax={1}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 520b3ed6..15a15869 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -485,10 +485,16 @@ export function StackedAreaChart({
   }, [sourceSeries]);
 
   const colors: Record<string, string> = {
+    // vLLM source names
     local_compute: '#f97316',
     local_cache_hit: '#3b82f6',
     external_kv_transfer: '#22c55e',
     miss: '#f97316',
+    // SGLang source names (set by compute-chart-series for sglang rows)
+    'cache hit (HBM)': '#3b82f6',
+    'cache hit (CPU offload)': '#22c55e',
+    'cache hit': '#3b82f6',
+    'compute (miss)': '#f97316',
   };
   const labelFor: Record<string, string> = {
     local_compute: 'Prefill',
@@ -496,6 +502,26 @@ export function StackedAreaChart({
     external_kv_transfer: 'Offload Cache Hit',
     miss: 'Miss',
   };
+  // Fallback palette for source names not in `colors`, so unknown names no
+  // longer all render gray (may still match a known color). Cycles by first use.
+  const fallbackPalette = [
+    '#3b82f6',
+    '#f97316',
+    '#22c55e',
+    '#a855f7',
+    '#ef4444',
+    '#06b6d4',
+    '#f59e0b',
+    '#ec4899',
+  ];
+  let fallbackIdx = 0;
+  const colorFor = (name: string): string => {
+    if (colors[name]) return colors[name]!;
+    const c = fallbackPalette[fallbackIdx % fallbackPalette.length]!;
+    fallbackIdx++;
+    colors[name] = c; // memoize so the SAME unknown name always gets the same color
+    return c;
+  };
 
   if (!computed) {
     return (
@@ -522,7 +548,7 @@ export function StackedAreaChart({
       .toReversed()
       .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
       .join(' ')} Z`;
-    const color = colors[name] ?? '#6b7280';
+    const color = colorFor(name);
     for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
     return { name, color, d };
   });
@@ -540,7 +566,7 @@ export function StackedAreaChart({
       }
     }
     const items: HoverItem[] = stackOrder.map((name) => ({
-      color: colors[name] ?? '#6b7280',
+      color: colorFor(name),
       label: labelFor[name] ?? name,
       value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
     }));
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 664bc6c7..bac67a50 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -44,6 +44,8 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
   /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 1996708f..8105961e 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -40,8 +40,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * breakdown from sglang:cached_tokens — current runs always have one
  * cache_source ("device" / HBM) but hicache (CPU offload) runs would
  * split into "device" + "host" automatically once ingested.
+ *
+ * v7: extract sglang:hicache_host_{used,total}_tokens into a new
+ * hostKvCacheUsage series so the KV cache utilization chart can plot
+ * the CPU offload pool's usage alongside the on-GPU HBM line.
  */
-export const CHART_SERIES_VERSION = 6;
+export const CHART_SERIES_VERSION = 7;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -79,6 +83,12 @@ export interface ChartSeries {
    * saved vs the raw queries that came in.
    */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /**
+   * Host (CPU offload) KV cache utilization, 0..1. Only populated for
+   * SGLang hicache runs (derived as hicache_host_used / hicache_host_total).
+   * Frontend overlays this on the KV cache util chart as a second line.
+   */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -125,6 +135,8 @@ const CHART_METRIC_KEYS = new Set([
   'sglang:num_running_reqs',
   'sglang:num_queue_reqs',
   'sglang:realtime_tokens',
+  'sglang:hicache_host_used_tokens',
+  'sglang:hicache_host_total_tokens',
 ]);
 
 /**
@@ -312,6 +324,27 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
   const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
+  // SGLang hicache: host-pool KV cache utilization as used/total per
+  // timeslice. Both metrics are gauges in absolute tokens. Total stays
+  // constant (it's the pool size), used fluctuates.
+  const hostUsedByT = aggregateByStart(
+    metrics['sglang:hicache_host_used_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostTotalByT = aggregateByStart(
+    metrics['sglang:hicache_host_total_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostKvCacheUsage: TimeSeriesPoint[] = [];
+  for (const [t, used] of [...hostUsedByT.entries()].toSorted((a, b) => a[0] - b[0])) {
+    const total = hostTotalByT.get(t);
+    if (total !== undefined && total > 0) {
+      hostKvCacheUsage.push({ t: tOf(t), value: used / total });
+    }
+  }
+
   // Per-source prompt tokens — sum across engines per source label.
   //   vllm: vllm:prompt_tokens_by_source has one series per source label
   //         (local_cache_hit, external_cache_hit, miss, ...). Use the
@@ -407,5 +440,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     prefillTps,
     decodeTps,
     prefixCacheHitsTps,
+    hostKvCacheUsage,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 76775e77..eccb0a0c 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -73,6 +73,8 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
   /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -118,6 +120,7 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     decodeTps: series.decodeTps,
     // v2 chart_series rows pre-backfill don't have this field — default to []
     prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
+    hostKvCacheUsage: series.hostKvCacheUsage ?? [],
   };
 }
 

From 93e197b7e54d140acfe65b61aeb4f5c48ca27091 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:19:20 -0500
Subject: [PATCH 54/55] fix(stacked-area): align sources by timestamp before
 computing shares
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cumulative-prompt-token-source-breakdown chart was showing huge
"100% compute (miss)" plateaus around minute 20-24 of many SGLang runs.

Root cause: the chart computed cumulative shares per ARRAY INDEX (not
timestamp), but in SGLang's per-scrape metrics, cache hits and misses
fire on different ticks — one scrape reports 193K hits + 0 miss, the
next reports 0 hits + 8K miss. So each source has a different timestamp
array. Indexing them in lockstep mixed values from different moments
and made the share calculation flap to 100% one side or the other.

Fix: union timestamps across all sources, then for each unique
timestamp carry forward each source's cumulative sum (a source that
didn't report at time t holds its previous cumulative value rather
than appearing as 0).

After fix: shares change smoothly over time as each source's cumulative
sum grows; transient single-tick gaps no longer drive the visible
share to either extreme.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/time-series-chart.tsx       | 31 ++++++++++++++++---
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 15a15869..75d7bb1e 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -464,15 +464,36 @@ export function StackedAreaChart({
   const computed = useMemo(() => {
     const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
     if (entries.length === 0) return null;
-    const tValues = entries[0]![1].map((p) => p.t);
+
+    // Different sources can land on different scrape timestamps
+    // (SGLang's hits/misses fire on alternating ticks), so we MUST
+    // align across all sources before computing shares — otherwise the
+    // share calculation indexes into each source's own time axis and
+    // mixes values from different moments.
+    //
+    // Approach: union all timestamps across sources, then for each
+    // unique timestamp carry forward the cumulative sum for every
+    // source (a source that didn't report at time t holds its previous
+    // cumulative value rather than dropping to 0).
+    const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted(
+      (a, b) => a - b,
+    );
+
+    // For each source, look up its value at every union timestamp and build
+    // a cumulative-sum array parallel to `tValues` via carry-forward.
     const cum: Record<string, number[]> = {};
     for (const [name, arr] of entries) {
+      const valByT = new Map(arr.map((p) => [p.t, p.value]));
+      const out: number[] = Array.from({ length: tValues.length });
       let acc = 0;
-      cum[name] = arr.map((p) => {
-        acc += p.value;
-        return acc;
-      });
+      for (let i = 0; i < tValues.length; i++) {
+        const v = valByT.get(tValues[i]!);
+        if (v !== undefined) acc += v;
+        out[i] = acc;
+      }
+      cum[name] = out;
     }
+
     const shares: Record<string, number[]> = {};
     for (const name of Object.keys(cum)) shares[name] = [];
     for (let i = 0; i < tValues.length; i++) {

From c14e19e277930495e4a43c3a6d6f42a611fec336 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:44:07 -0500
Subject: [PATCH 55/55] fix(ingest): split GPU vs CPU cache hit rate for SGLang
 hicache rows

Previous inline derivation (commit 625d6e8) summed ALL cache hit
sources into server_gpu_cache_hit_rate, which conflated GPU HBM hits
with CPU offload hits on SGLang hicache rows. The harness JSON also
never sets server_cpu_cache_hit_rate.

Now derives both metrics from chart_series.promptTokensBySource:
  server_gpu_cache_hit_rate = sum(HBM + 'cache hit') / sum(prompts)
  server_cpu_cache_hit_rate = sum(CPU offload) / sum(prompts) or null
                              (null when no CPU offload source exists)

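Falls back to prefixCacheHitsTps for vLLM rows where promptTokensBySource
isn't broken out by cache source. Always overwrites any pre-existing
server_gpu_cache_hit_rate; server_cpu_cache_hit_rate is written only when
CPU offload hits exist (otherwise the key is left untouched), so the
derivation stays consistent with what the detail-page charts plot.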

Backfilled all existing rows via two-phase SQL update earlier in the
session:
  - 8 hicache rows in workflow_run 947 now show GPU ~1-2% / CPU ~87-91%
  - Other SGLang rows show GPU ~87% / CPU null
  - vLLM rows restored to their original GPU hit rates

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 40 +++++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8d1e01b8..43655d9a 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -101,21 +101,43 @@ export async function insertTraceReplay(
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
 
-  // Derive a lifetime GPU cache hit rate from chart_series for any linked
-  // row whose harness JSON didn't set one (SGLang runs don't populate
-  // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has
-  // no usable prefill data — leaves the field null in that case, matching
-  // legacy "no trace_replay" behavior.
+  // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang
+  // runs don't populate these in the harness JSON; vLLM runs do but only
+  // for GPU. We always recompute the GPU rate (overwriting any pre-existing
+  // value) to match the detail-page charts; the CPU rate is set only if > 0.
+  //
+  // For hicache (CPU offload) rows the chart_series.promptTokensBySource
+  // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)"
+  // sources, letting us split GPU vs CPU hit rate. Other rows just have
+  // a single cache-hit source (either "cache hit (HBM)" / "cache hit"
+  // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps
+  // sum which equals the single cache source's total).
   if (chartSeries && chartSeries.prefillTps.length > 0) {
     const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
-    const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
     if (sumPrompts > 0) {
-      const rate = sumHits / sumPrompts;
+      const sumOf = (name: string): number =>
+        (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
+      const cpuHits = sumOf('cache hit (CPU offload)');
+      const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit');
+      // If the source breakdown has a HBM entry, use it (covers SGLang).
+      // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path).
+      const gpuHits =
+        hbmFromBreakdown > 0
+          ? hbmFromBreakdown
+          : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+      const gpuRate = gpuHits / sumPrompts;
+      const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null;
       await sql`
         update benchmark_results
-        set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric))
+        set metrics = jsonb_set(
+          case when ${cpuRate}::numeric is not null
+            then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric))
+            else metrics
+          end,
+          '{server_gpu_cache_hit_rate}',
+          to_jsonb(${gpuRate}::numeric)
+        )
         where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
-          and (metrics->>'server_gpu_cache_hit_rate') is null
       `;
     }
   }