From 0e35e5f0b10c2c9db10094031a2ac92e59fff9f3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 23 Apr 2026 13:40:27 -0500 Subject: [PATCH 01/55] feat: agentic benchmark ingest + UI with offload-mode halo Adds agentic_traces scenario end-to-end: - Schema migrations for agentic scenario, availability, and KV offload mode - DB ingest/ETL + query updates to carry scenario, offload_mode, and server/theoretical cache-hit rates through to the API layer - Frontend types, filters (GlobalFilterContext / InferenceContext / ChartControls), URL state, and tooltip rows for agentic-only fields - ScatterGraph: subtle dashed halo on Pareto-frontier points that used KV offload so the tradeoff is visible at a glance --- packages/app/cypress/support/mock-data.ts | 2 + .../app/src/app/api/unofficial-run/route.ts | 2 + .../src/components/GlobalFilterContext.tsx | 12 +- .../components/inference/InferenceContext.tsx | 15 ++- .../inference/hooks/useChartData.ts | 34 +++-- .../app/src/components/inference/types.ts | 26 ++++ .../components/inference/ui/ChartControls.tsx | 27 +++- .../components/inference/ui/ScatterGraph.tsx | 21 +++ .../inference/utils/tooltipUtils.ts | 54 +++++++- .../app/src/components/ui/chart-selectors.tsx | 124 ++++++++++++++++++ .../unofficial-run-provider.test.ts | 2 + .../components/unofficial-run-provider.tsx | 4 +- packages/app/src/lib/api.ts | 14 +- .../app/src/lib/benchmark-transform.test.ts | 2 + packages/app/src/lib/benchmark-transform.ts | 65 ++++++++- packages/app/src/lib/data-mappings.ts | 72 +++++++++- packages/app/src/lib/url-state.ts | 2 + packages/constants/src/models.ts | 17 +++ .../db/migrations/002_agentic_scenario.sql | 30 +++++ .../migrations/003_agentic_availability.sql | 21 +++ packages/db/migrations/004_offload_mode.sql | 42 ++++++ packages/db/src/etl/benchmark-ingest.ts | 28 ++-- packages/db/src/etl/benchmark-mapper.ts | 45 ++++++- packages/db/src/ingest-ci-run.ts | 6 +- packages/db/src/ingest-gcs-backup.ts | 6 +- 
packages/db/src/ingest-supplemental.ts | 14 +- packages/db/src/json-provider.ts | 8 +- packages/db/src/queries/benchmarks.ts | 13 +- packages/db/src/queries/workflow-info.ts | 15 ++- 29 files changed, 645 insertions(+), 78 deletions(-) create mode 100644 packages/db/migrations/002_agentic_scenario.sql create mode 100644 packages/db/migrations/003_agentic_availability.sql create mode 100644 packages/db/migrations/004_offload_mode.sql diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index e6720c0b..7a4f59a9 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,6 +189,8 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), + selectedPercentile: 'median', + setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), selectedE2eXAxisMetric: null, diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 79ac0665..dbfb9c33 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -49,6 +49,8 @@ export function normalizeArtifactRows( decode_num_workers: config.decodeNumWorkers, num_prefill_gpu: config.numPrefillGpu, num_decode_gpu: config.numDecodeGpu, + benchmark_type: params.benchmarkType, + offload_mode: params.offloadMode, isl: params.isl, osl: params.osl, conc: params.conc, diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 65f510cd..f603081a 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from 
'@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; @@ -172,11 +172,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { const availableSequences = useMemo(() => { if (!availabilityRows) return SEQUENCE_OPTIONS; const seqs = [ - ...new Set( - modelRows - .map((r) => islOslToSequence(r.isl, r.osl)) - .filter((s): s is Sequence => s !== null), - ), + ...new Set(modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null)), ]; return seqs.length > 0 ? seqs : SEQUENCE_OPTIONS; }, [availabilityRows, modelRows]); @@ -190,7 +186,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // Precisions available for the selected model + sequence const availablePrecisions = useMemo(() => { if (!availabilityRows) return ['fp4']; - const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const precs = [...new Set(rows.map((r) => r.precision))].toSorted(); return precs.length > 0 ? 
precs : ['fp4']; }, [availabilityRows, modelRows, effectiveSequence]); @@ -205,7 +201,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // Dates available for selected model + sequence + precisions const availableDates = useMemo(() => { if (!availabilityRows) return []; - const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision)); if (rows.length === 0) { return [...new Set(seqRows.map((r) => r.date))].toSorted(); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 7fa416fd..6f45d8d7 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import { FAVORITE_PRESETS, type FavoritePreset } from '@/components/favorites/favorite-presets'; @@ -110,6 +110,11 @@ export function InferenceProvider({ const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState( () => getUrlParam('i_e2e_xmetric') || null, ); + // Latency percentile applied to the chart x-axis for agentic scenarios. + // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. 
+ const [selectedPercentile, setSelectedPercentile] = useState( + () => getUrlParam('i_pctl') || 'median', + ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', ); @@ -163,6 +168,7 @@ export function InferenceProvider({ effectiveRunDate, isActive, latestDate, + selectedPercentile, ); // For GPU comparison date picker — use shared availability data from global filters @@ -176,7 +182,7 @@ export function InferenceProvider({ if (!availabilityRows) return availableDates; const rows = availabilityRows.filter((r) => { if (!dbModelKeys.includes(r.model)) return false; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false; + if (rowToSequence(r) !== effectiveSequence) return false; if (!effectivePrecisions.includes(r.precision)) return false; if (!r.hardware) return false; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -201,7 +207,7 @@ export function InferenceProvider({ const hwKeys = new Set(); for (const r of availabilityRows) { if (!dbModelKeys.includes(r.model)) continue; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue; + if (rowToSequence(r) !== effectiveSequence) continue; if (!effectivePrecisions.includes(r.precision)) continue; if (!r.hardware) continue; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -589,6 +595,7 @@ export function InferenceProvider({ useUrlStateSync( { i_metric: selectedYAxisMetric, + i_pctl: selectedPercentile, i_gpus: selectedGPUs.join(','), i_dates: selectedDates.join(','), i_dstart: selectedDateRange.startDate, @@ -783,6 +790,8 @@ export function InferenceProvider({ workflowInfo, selectedYAxisMetric, setSelectedYAxisMetric: setSelectedYAxisMetricAndClear, + selectedPercentile, + setSelectedPercentile, selectedGPUs, setSelectedGPUs: setSelectedGPUsAndClear, availableGPUs, diff --git 
a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 625e63ab..81ab0780 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -1,7 +1,7 @@ import { useMemo, useRef } from 'react'; import { useQueries } from '@tanstack/react-query'; -import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants'; +import { rowToSequence } from '@semianalysisai/inferencex-constants'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { @@ -15,7 +15,7 @@ import type { import { filterDataByCostLimit } from '@/components/inference/utils'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants'; -import { transformBenchmarkRows } from '@/lib/benchmark-transform'; +import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; import type { Model, Sequence } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; @@ -79,6 +79,7 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, + selectedPercentile = 'median', ) { // When the selected date is the latest available, use '' (empty string) to match // the initial no-date query key, reusing the eagerly-fetched benchmarks from the @@ -119,11 +120,13 @@ export function useChartData( // Merge main rows with comparison date rows. // Stamp each row with the *requested* date (not the actual DB date) so that // GPUGraph's activeDates filter (keyed by user-selected date) matches the points. 
- const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]); + // + // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via + // benchmark_type), so one filter covers every scenario. const rows = useMemo(() => { - if (!allRows || !sequenceIslOsl) return []; - const seqFilter = (r: { isl: number; osl: number }) => - r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl; + if (!allRows) return []; + const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) => + rowToSequence(r) === selectedSequence; const seqFiltered = allRows.filter(seqFilter); // For each (hw, framework, spec_method, disagg, precision) group, keep only @@ -150,14 +153,14 @@ export function useChartData( .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })), ); return [...mainRows, ...extraRows]; - }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]); + }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]); // Transform filtered rows into chart data const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => { if (rows.length === 0) return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig }; - return transformBenchmarkRows(rows); - }, [rows]); + return transformBenchmarkRows(rows, selectedPercentile); + }, [rows, selectedPercentile]); // Sort hardware config — stabilize reference when keys haven't changed. // Different sequences for the same model often have the same GPU configs, @@ -192,8 +195,11 @@ export function useChartData( (chartDefinitions as ChartDefinition[]).map((chartDef) => { const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; - // Determine dynamic x-axis - let xAxisField: keyof AggDataEntry = chartDef.x; + // Default x-axis = chart's natural latency metric, percentile-adjusted + // for the agentic case (median_e2el → p99_e2el etc.). 
For non-agentic + // scenarios `withPercentile` is a no-op when percentile === 'median'. + const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry; + let xAxisField: keyof AggDataEntry = naturalX; let xAxisLabel = chartDef.x_label; const metricTitle = @@ -232,8 +238,10 @@ export function useChartData( // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). // E2EL → TTFT keeps the same direction ("lower is better" for both), // so no roofline flip is needed for the e2e chart. + // Compare against `naturalX` (percentile-adjusted) — switching the + // percentile of the same logical metric is NOT a flip. const xAxisFlipped = - xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride); + xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride); const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition; const dynamicYLabel = chartDef[yLabelKey]; @@ -261,7 +269,7 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric], + [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile], ); // Build renderable graphs (data processing + stable chart definitions) diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a23707ba..53c8d84c 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -88,6 +88,29 @@ export interface AggDataEntry { actualDate?: string; /** URL to the GitHub Actions workflow run that produced this data point. */ run_url?: string; + /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */ + benchmark_type?: string; + /** ISL in tokens — null for agentic_traces. */ + isl?: number | null; + /** OSL in tokens — null for agentic_traces. 
*/ + osl?: number | null; + // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ── + /** "on" | "off" — whether KV cache offload to CPU was enabled. */ + offload_mode?: string; + /** Actual server-observed GPU prefix-cache hit rate (0..1). */ + server_gpu_cache_hit_rate?: number; + /** Actual server-observed CPU prefix-cache hit rate (0..1). */ + server_cpu_cache_hit_rate?: number; + /** Infinite-cache theoretical hit rate (0..1) computed from trace. */ + theoretical_cache_hit_rate?: number; + /** Total requests attempted during the window. */ + num_requests_total?: number; + /** Requests that completed successfully. */ + num_requests_successful?: number; + /** Total prompt tokens served. */ + total_prompt_tokens?: number; + /** Total generated (output) tokens. */ + total_generation_tokens?: number; } /** @@ -468,6 +491,9 @@ export interface InferenceChartContextType { workflowInfo: any; selectedYAxisMetric: string; setSelectedYAxisMetric: (metric: string) => void; + /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). 
*/ + selectedPercentile: string; + setSelectedPercentile: (p: string) => void; selectedXAxisMetric: string | null; setSelectedXAxisMetric: (metric: string | null) => void; selectedE2eXAxisMetric: string | null; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 5f8e7787..e4f55ad7 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -1,11 +1,14 @@ 'use client'; +import { useEffect, useState } from 'react'; + import { track } from '@/lib/analytics'; import { useInference } from '@/components/inference/InferenceContext'; import { ModelSelector, - SequenceSelector, + ScenarioSelector, + PercentileSelector, PrecisionSelector, } from '@/components/ui/chart-selectors'; import { DateRangePicker } from '@/components/ui/date-range-picker'; @@ -23,7 +26,7 @@ import { import { TooltipProvider } from '@/components/ui/tooltip'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { ChartDefinition } from '@/components/inference/types'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model, type Percentile } from '@/lib/data-mappings'; // Build Y-axis metric options from static chart config JSON — available immediately, no API wait const METRIC_GROUPS = [ @@ -78,6 +81,13 @@ interface ChartControlsProps { } export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) { + // The percentile selector is rendered conditionally on `selectedSequence`, + // which on the client is hydrated from URL params. SSR doesn't see the URL, + // so deferring the conditional until after mount keeps the initial DOM + // identical between server and client (avoids hydration warnings). 
+ const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const { selectedModel, setSelectedModel, @@ -87,6 +97,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro setSelectedPrecisions, selectedYAxisMetric, setSelectedYAxisMetric, + selectedPercentile, + setSelectedPercentile, graphs, selectedGPUs, setSelectedGPUs, @@ -203,12 +215,19 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro availableModels={availableModels} data-testid="model-selector" /> - + {mounted && selectedSequence === Sequence.AgenticTraces && ( + setSelectedPercentile(p)} + data-testid="percentile-selector" + /> + )} ('.dot-group').each(function (d) { + const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`); + const showHalo = onFrontier && d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); + // Double-click to track/untrack zoomGroup .selectAll('.dot-group') @@ -1567,6 +1585,9 @@ const ScatterGraph = React.memo( chartDefinition.chartType, xScaleConfig._isLog, yScaleConfig.type, + optimalPointKeys, + getCssColor, + resolveColor, ], ); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index e88e9930..7391225e 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -88,6 +88,51 @@ const runLinkHTML = (runUrl?: string) => const tooltipLine = (label: string, value: string | number) => `
${label}: ${value}
`; +const formatPct = (v: number | undefined): string | null => + v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; + +/** + * Agentic-only tooltip rows: offload mode, KV cache hit rates, request + * success, token totals. Returns an empty string for non-agentic rows. + */ +const generateAgenticHTML = (d: InferenceData): string => { + if (d.benchmark_type !== 'agentic_traces') return ''; + + const parts: string[] = []; + if (d.offload_mode) { + parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase())); + } + + const gpuHit = formatPct(d.server_gpu_cache_hit_rate); + const cpuHit = formatPct(d.server_cpu_cache_hit_rate); + const theoHit = formatPct(d.theoretical_cache_hit_rate); + if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit)); + if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit)); + if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit)); + + if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) { + const successPct = + d.num_requests_total > 0 + ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)` + : ''; + parts.push( + tooltipLine( + 'Requests', + `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`, + ), + ); + } + + if (d.total_prompt_tokens !== undefined) { + parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens))); + } + if (d.total_generation_tokens !== undefined) { + parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens))); + } + + return parts.join(''); +}; + /** * Generates HTML for the parallelism configuration section of a tooltip. * Falls back to GPU count for old data without parallelism fields. @@ -177,9 +222,10 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} ${ isPinned @@ -231,9 +277,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`; }; @@ -292,9 +339,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)}
`; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index 75e2f257..1c843e12 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -19,12 +19,16 @@ import { type Model, type Precision, type Sequence, + type Percentile, + PERCENTILE_OPTIONS, getModelCategory, getModelLabel, + getPercentileLabel, getPrecisionLabel, getSequenceCategory, getSequenceLabel, groupByCategory, + sequenceKind, } from '@/lib/data-mappings'; function DeprecatedLabel({ reason }: { reason: string }) { @@ -167,6 +171,126 @@ export function SequenceSelector({ ); } +interface ScenarioSelectorProps { + id?: string; + value: string; + onChange: (value: Sequence) => void; + availableSequences: string[]; + 'data-testid'?: string; +} + +/** + * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length", + * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL + * framing only applies to the fixed-seq subset). + */ +export function ScenarioSelector({ + id = 'scenario-select', + value, + onChange, + availableSequences, + 'data-testid': testId, +}: ScenarioSelectorProps) { + const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq'); + const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic'); + const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence)); + + return ( +
+ + +
+ ); +} + +interface PercentileSelectorProps { + id?: string; + value: string; + onChange: (value: Percentile) => void; + 'data-testid'?: string; +} + +/** + * Latency percentile selector for agentic-trace charts. The selected value + * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so + * picking p99 plots p99 e2e latency / interactivity instead of the median. + */ +export function PercentileSelector({ + id = 'percentile-select', + value, + onChange, + 'data-testid': testId, +}: PercentileSelectorProps) { + return ( +
+ + +
+ ); +} + interface PrecisionSelectorProps { id?: string; value: string[]; diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts index f4263d2c..05b522c5 100644 --- a/packages/app/src/components/unofficial-run-provider.test.ts +++ b/packages/app/src/components/unofficial-run-provider.test.ts @@ -29,6 +29,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 128, diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index 2dccdf7f..42530a51 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -12,7 +12,7 @@ import { import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types'; import { UnofficialBanner } from '@/components/ui/unofficial-banner'; -import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants'; import { computeToggle } from '@/hooks/useTogglableSet'; import type { BenchmarkRow, EvalRow } from '@/lib/api'; import { normalizeEvalHardwareKey } from '@/lib/chart-utils'; @@ -93,7 +93,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData const groups = new Map(); for (const row of benchmarks) { const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? 
row.model; - const sequence = islOslToSequence(row.isl, row.osl); + const sequence = rowToSequence(row); if (!sequence) continue; const key = `${displayModel}_${sequence}`; if (!groups.has(key)) groups.set(key, []); diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 11ba4521..240251c3 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -23,9 +23,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -140,13 +144,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. 
+ isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index be76438e..6a6c97c8 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -23,6 +23,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 64, diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 107f0b12..69745da2 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -15,9 +15,39 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils'; import { getHardwareConfig } from '@/lib/constants'; import type { BenchmarkRow } from '@/lib/api'; +/** + * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl + * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here: + * e2el ≡ ttlt (time-to-last-token == end-to-end latency) + * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output) + * intvty ≡ 1/itl (tok/s from the user's perspective) + * Existing fields win if present; we only fill in the gaps. 
+ */ +function agenticAliases(m: Record): Record { + const out: Record = {}; + for (const suffix of ['mean', 'median', 'p90', 'p99']) { + const itl = m[`${suffix}_itl`]; + const ttlt = m[`${suffix}_ttlt`]; + if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; + if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl; + if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) { + out[`${suffix}_intvty`] = 1 / itl; + } + } + return out; +} + /** Convert a DB benchmark row to an AggDataEntry. */ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { - const m = row.metrics; + const isAgentic = row.benchmark_type === 'agentic_traces'; + const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics; + // Prefer the dedicated column (added in migration 004); fall back to the + // legacy stash inside `metrics` for any rows ingested before that column + // existed. + const rawMetrics = row.metrics as Record; + const offloadMode = + row.offload_mode ?? + (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined); return { hw: row.hardware, framework: row.framework, @@ -68,6 +98,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { date: row.date, actualDate: (row as any).actualDate ?? row.date, run_url: row.run_url ?? 
undefined, + benchmark_type: row.benchmark_type, + isl: row.isl, + osl: row.osl, + offload_mode: offloadMode, + server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate, + server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate, + theoretical_cache_hit_rate: m.theoretical_cache_hit_rate, + num_requests_total: m.num_requests_total, + num_requests_successful: m.num_requests_successful, + total_prompt_tokens: m.total_prompt_tokens, + total_generation_tokens: m.total_generation_tokens, }; } @@ -77,13 +118,30 @@ interface PreparedEntry { date: string; } +/** + * Rewrite a chart x-axis key to use a different latency percentile prefix + * (`median_` → `p99_` etc). Only touches keys that start with a known + * percentile prefix; leaves everything else alone. + */ +export function withPercentile(key: string, percentile: string): string { + return key.replace(/^(mean|median|p90|p99|p99\.9)_/, `${percentile}_`); +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). * * Converts rows to AggDataEntry once, then reuses for each chart definition. + * + * @param percentile Optional latency percentile for the chart x-axis + * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart + * definition for the chosen percentile — only agentic rows carry the + * full set (median/p90/p99/p99.9) so this mainly affects that scenario. 
*/ -export function transformBenchmarkRows(rows: BenchmarkRow[]): { +export function transformBenchmarkRows( + rows: BenchmarkRow[], + percentile = 'median', +): { chartData: InferenceData[][]; hardwareConfig: HardwareConfig; } { @@ -109,13 +167,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): { // Phase 2: Build chart data per chart definition (reusing prepared entries) const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => { + const xKey = withPercentile(chartDef.x, percentile); const groupedByHw: Record = {}; for (const { entry, hwKey, date } of prepared) { const dataPoint = createChartDataPoint( date, entry, - chartDef.x as keyof AggDataEntry, + xKey as keyof AggDataEntry, chartDef.y as keyof AggDataEntry, hwKey, ); diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 823b6823..8900f50e 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -102,17 +102,77 @@ export enum Sequence { OneK_OneK = '1k/1k', OneK_EightK = '1k/8k', EightK_OneK = '8k/1k', + AgenticTraces = 'agentic-traces', } -const SEQUENCE_CONFIG: Record = - { - [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' }, - [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' }, - [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' }, - }; +/** + * Top-level scenario kind. Fixed-seq sequences cluster under a single group + * in the selector; agentic traces sit alongside as their own kind. + */ +export type ScenarioKind = 'fixed-seq' | 'agentic'; + +export function sequenceKind(seq: Sequence): ScenarioKind { + return seq === Sequence.AgenticTraces ? 
'agentic' : 'fixed-seq'; +} + +const SEQUENCE_CONFIG: Record< + Sequence, + { label: string; compact: string; category: CategoryTag; kind: ScenarioKind } +> = { + [Sequence.OneK_OneK]: { + label: '1K / 1K', + compact: '1k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.OneK_EightK]: { + label: '1K / 8K', + compact: '1k8k', + category: 'deprecated', + kind: 'fixed-seq', + }, + [Sequence.EightK_OneK]: { + label: '8K / 1K', + compact: '8k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.AgenticTraces]: { + label: 'Agentic Traces', + compact: 'agentic', + category: 'default', + kind: 'agentic', + }, +}; export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; +/** + * Percentile of the latency distribution used for the chart x-axis when + * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants + * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which + * slice to plot. + */ +export enum Percentile { + Median = 'median', + P90 = 'p90', + P99 = 'p99', + P99_9 = 'p99.9', +} + +const PERCENTILE_CONFIG: Record = { + [Percentile.Median]: { label: 'p50 (median)' }, + [Percentile.P90]: { label: 'p90' }, + [Percentile.P99]: { label: 'p99' }, + [Percentile.P99_9]: { label: 'p99.9' }, +}; + +export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; + +export function getPercentileLabel(p: Percentile): string { + return PERCENTILE_CONFIG[p]?.label ?? 
p; +} + export const DEPRECATED_SEQUENCES: ReadonlySet = new Set( (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][]) .filter(([, c]) => c.category === 'deprecated') diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts index 3947488f..fb2e9d70 100644 --- a/packages/app/src/lib/url-state.ts +++ b/packages/app/src/lib/url-state.ts @@ -22,6 +22,7 @@ const URL_STATE_KEYS = [ 'i_seq', 'i_prec', 'i_metric', + 'i_pctl', 'i_xmetric', 'i_e2e_xmetric', 'i_scale', @@ -61,6 +62,7 @@ export const PARAM_DEFAULTS: Record = { i_seq: '8k/1k', i_prec: 'fp4', i_metric: 'y_tpPerGpu', + i_pctl: 'median', i_xmetric: 'p99_ttft', i_e2e_xmetric: '', i_scale: 'auto', diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts index 6d646f08..d9a3d2d1 100644 --- a/packages/constants/src/models.ts +++ b/packages/constants/src/models.ts @@ -53,3 +53,20 @@ export function islOslToSequence(isl: number, osl: number): string | null { }; return map[`${isl}_${osl}`] ?? null; } + +/** + * Map a benchmark/availability row to its sequence (scenario) string. + * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl. + * - Other rows (today: `single_turn`) fall back to `islOslToSequence`. + * Returns `null` for rows that can't be classified (e.g. `single_turn` with + * unmapped isl/osl values). + */ +export function rowToSequence(row: { + isl: number | null; + osl: number | null; + benchmark_type: string; +}): string | null { + if (row.benchmark_type === 'agentic_traces') return 'agentic-traces'; + if (row.isl === null || row.osl === null) return null; + return islOslToSequence(row.isl, row.osl); +} diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql new file mode 100644 index 00000000..c143914e --- /dev/null +++ b/packages/db/migrations/002_agentic_scenario.sql @@ -0,0 +1,30 @@ +-- Support agentic scenarios in benchmark_results. 
+-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. + +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql new file mode 100644 index 00000000..e96cbd50 --- /dev/null +++ b/packages/db/migrations/003_agentic_availability.sql @@ -0,0 +1,21 @@ +-- Extend the availability table to cover agentic scenarios. +-- +-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. 
+ +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql new file mode 100644 index 00000000..24b617f1 --- /dev/null +++ b/packages/db/migrations/004_offload_mode.sql @@ -0,0 +1,42 @@ +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. 
+drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index 67173c64..ea802d3f 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows( // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears // more than once in a single batch. Deduplicate within the batch, keeping - // the last occurrence (last metrics for each unique config/isl/osl/conc). + // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode). const seen = new Map(); - for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r); + for (const r of rows) { + seen.set( + `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? 
''}-${r.conc}-${r.offloadMode}`, + r, + ); + } const deduped = [...seen.values()]; const configIds = deduped.map((r) => r.configId); + const benchmarkTypes = deduped.map((r) => r.benchmarkType); + const offloadModes = deduped.map((r) => r.offloadMode); const isls = deduped.map((r) => r.isl); const osls = deduped.map((r) => r.osl); const concs = deduped.map((r) => r.conc); @@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows( const result = await sql<{ inserted: boolean; id: number }[]>` insert into benchmark_results ( - workflow_run_id, config_id, benchmark_type, date, + workflow_run_id, config_id, benchmark_type, offload_mode, date, isl, osl, conc, image, metrics ) select ${workflowRunId}, unnest(${sql.array(configIds)}::int[]), - 'single_turn', + unnest(${sql.array(benchmarkTypes)}::text[]), + unnest(${sql.array(offloadModes)}::text[]), ${date}::date, unnest(${sql.array(isls)}::int[]), unnest(${sql.array(osls)}::int[]), unnest(${sql.array(concs)}::int[]), unnest(${sql.array(images)}), unnest(${sql.array(metricsJsons)}::jsonb[]) - on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc) + on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode) do update set metrics = excluded.metrics, image = excluded.image @@ -147,13 +155,14 @@ export async function bulkUpsertAvailability( sql: Sql, rows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[], date: string, ): Promise { @@ -162,7 +171,7 @@ export async function bulkUpsertAvailability( const seen = new Set(); const unique: typeof rows = []; for (const r of rows) { - const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`; + const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? 
''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`; if (!seen.has(key)) { seen.add(key); unique.push(r); @@ -170,7 +179,7 @@ export async function bulkUpsertAvailability( } await sql` - insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date) + insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date) select unnest(${sql.array(unique.map((r) => r.model))}::text[]), unnest(${sql.array(unique.map((r) => r.isl))}::int[]), @@ -180,6 +189,7 @@ export async function bulkUpsertAvailability( unnest(${sql.array(unique.map((r) => r.framework))}::text[]), unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]), unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]), + unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]), ${date}::date on conflict do nothing `; diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 7d78e175..5b120843 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([ 'decode_num_workers', 'num_prefill_gpu', 'num_decode_gpu', + // agentic scenario + 'scenario_type', + 'users', + 'offload_mode', + 'num_requests_total', + 'num_requests_successful', ]); +/** + * `benchmark_type` values understood by the ingest. + * - `single_turn` — fixed sequence-length runs (isl/osl set). + * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc). + */ +export type BenchmarkType = 'single_turn' | 'agentic_traces'; + /** * METRIC_KEYS from constants is the canonical set of known metric keys. 
* Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured @@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set(); export interface BenchmarkParams { config: ConfigParams; - isl: number; - osl: number; + benchmarkType: BenchmarkType; + // Null for agentic_traces; present for single_turn. + isl: number | null; + osl: number | null; conc: number; + /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */ + offloadMode: string; image: string | null; metrics: Record; } @@ -114,10 +131,15 @@ export function mapBenchmarkRow( return null; } - const isl = parseInt2(row.isl) ?? islOslFallback?.isl; - const osl = parseInt2(row.osl) ?? islOslFallback?.osl; - const conc = parseInt2(row.conc); - if (!isl || !osl || !conc) { + // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants), + // no isl/osl, and `users` instead of `conc`. Everything else stays as-is. + const isAgentic = String(row.scenario_type ?? '').startsWith('agentic'); + const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn'; + + const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); + const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); + const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc); + if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } @@ -182,6 +204,12 @@ export function mapBenchmarkRow( } } + // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it + // as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic && typeof row.offload_mode === 'string') { + (metrics as Record).offload_mode = row.offload_mode; + } + // Artifact names encode '/' as '#' to avoid path separators; restore the URI. const image = row.image ? 
String(row.image).replaceAll('#', '/') : null; @@ -205,9 +233,14 @@ export function mapBenchmarkRow( numPrefillGpu, numDecodeGpu, }, + benchmarkType, isl, osl, conc, + offloadMode: + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : 'off', image, metrics, }; diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 14c7b4d0..8cce43ca 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -248,13 +248,14 @@ async function main(): Promise { const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; let totalNewBmk = 0, @@ -367,6 +368,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index e20278d6..6dc604e9 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -596,13 +596,14 @@ async function main(): Promise { // Upsert availability rows only for successfully resolved configs const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const r of allInserted) { availRows.push({ @@ -614,6 +615,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } if (availRows.length > 0) { diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts index 1e494e9f..43aae047 100644 --- a/packages/db/src/ingest-supplemental.ts +++ 
b/packages/db/src/ingest-supplemental.ts @@ -219,8 +219,10 @@ async function ingestSupplementalBmk( const rows: { configId: number; - isl: number; - osl: number; + benchmarkType: 'single_turn' | 'agentic_traces'; + offloadMode: string; + isl: number | null; + osl: number | null; conc: number; image: string | null; metrics: Record; @@ -271,6 +273,8 @@ async function ingestSupplementalBmk( rows.push({ configId, + benchmarkType: 'single_turn', + offloadMode: 'off', isl: entry.isl, osl: entry.osl, conc: entry.conc, @@ -294,13 +298,14 @@ async function ingestSupplementalBmk( // to `rows` are exactly the valid ones. const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const entry of entries) { const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined }); @@ -317,6 +322,7 @@ async function ingestSupplementalBmk( framework, specMethod, disagg, + benchmarkType: 'single_turn', }); } if (availRows.length > 0) { diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 0d9373d3..f09a2686 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -290,6 +290,8 @@ function toBenchmarkRow( decode_num_workers: c.decode_num_workers, num_prefill_gpu: c.num_prefill_gpu, num_decode_gpu: c.num_decode_gpu, + benchmark_type: br.benchmark_type ?? 'single_turn', + offload_mode: (br as { offload_mode?: string }).offload_mode ?? 
'off', isl: br.isl, osl: br.osl, conc: br.conc, @@ -410,7 +412,11 @@ export function getAvailabilityData(): AvailabilityRow[] { for (const a of s.availability) { const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`; if (validKeys.has(key)) { - rows.push({ ...a, date: toDateString(a.date) }); + rows.push({ + ...a, + benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn', + date: toDateString(a.date), + }); } } diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 1c30b1fd..74e20380 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -18,9 +18,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces; numeric for single_turn fixed-seq runs. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. 
*/ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -68,6 +72,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -106,6 +112,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, lb.conc, @@ -153,6 +161,7 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, br.isl, br.osl, br.conc, diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts index b4e4f255..d5e2d933 100644 --- a/packages/db/src/queries/workflow-info.ts +++ b/packages/db/src/queries/workflow-info.ts @@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise { const rows = await sql` - SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text + SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text FROM availability a WHERE EXISTS ( SELECT 1 @@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise Date: Thu, 30 Apr 2026 19:01:56 -0500 Subject: [PATCH 02/55] =?UTF-8?q?fix:=20agentic=20offload=20variants=20?= =?UTF-8?q?=E2=80=94=20render=20both=20halos=20+=20map=20renamed=20fields?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ScatterGraph: include `offload_mode` in `buildPointConfigId` so d3's data join keeps both `on` and `off` variants for the same (config, conc). Without it, the second variant collapsed onto the first key, so FP8 offload-on points (and their halos) silently disappeared. 
- benchmark-mapper: handle older artifacts that emit `users`/`offload_mode` AND newer ones that emit `conc`/`offloading` (with 'none' → 'off' mapping). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 4 +++ packages/db/src/etl/benchmark-mapper.ts | 27 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 15bb60f0..55a206ce 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -295,6 +295,10 @@ const ScatterGraph = React.memo( const buildPointConfigId = useCallback((point: InferenceData): string => { let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`; if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`; + // Agentic runs emit two rows per (config, conc) — one offload=on, one off. + // Without this suffix, d3's data join treats them as the same point and + // drops one variant (along with its halo). + if (point.offload_mode) key += `|offload-${point.offload_mode}`; return key; }, []); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 5b120843..d842276e 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -138,12 +138,24 @@ export function mapBenchmarkRow( const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); - const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc); + // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones. + const conc = isAgentic ? (parseInt2(row.users) ?? 
parseInt2(row.conc)) : parseInt2(row.conc); if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` + // ('none' → 'off'; any other non-empty value → 'on'). + const offloadModeRaw = + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : typeof row.offloading === 'string' && row.offloading.length > 0 + ? row.offloading === 'none' + ? 'off' + : 'on' + : 'off'; + const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg); const isMultinode = parseBool(row.is_multinode); const precision = normalizePrecision(String(row.precision ?? '')); @@ -204,10 +216,10 @@ export function mapBenchmarkRow( } } - // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it - // as a stringified metric so the frontend can expose it in tooltips. - if (isAgentic && typeof row.offload_mode === 'string') { - (metrics as Record).offload_mode = row.offload_mode; + // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`) + // — preserve as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic) { + (metrics as Record).offload_mode = offloadModeRaw; } // Artifact names encode '/' as '#' to avoid path separators; restore the URI. @@ -237,10 +249,7 @@ export function mapBenchmarkRow( isl, osl, conc, - offloadMode: - typeof row.offload_mode === 'string' && row.offload_mode.length > 0 - ? row.offload_mode - : 'off', + offloadMode: offloadModeRaw, image, metrics, }; From 07ba10636dae87b5a819afa524d7c10322fae41b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 00:29:55 -0500 Subject: [PATCH 03/55] fix: render offload halo on every offload-on point, not just frontier The halo's purpose is to surface KV-offload usage; restricting it to Pareto-frontier-only points hid the indicator on most runs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/inference/ui/ScatterGraph.tsx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 55a206ce..61ac0983 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -1516,10 +1516,9 @@ const ScatterGraph = React.memo( .attr('pointer-events', 'none'); }); - // Offload halo: dashed ring on frontier points that used KV offload + // Offload halo: dashed ring on every point that used KV offload (Pareto or not) zoomGroup.selectAll('.dot-group').each(function (d) { - const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`); - const showHalo = onFrontier && d.offload_mode === 'on'; + const showHalo = d.offload_mode === 'on'; d3.select(this) .selectAll('.offload-halo') .data(showHalo ? [true] : []) From 95e9dc77431adf5354ef0df36989816199624383 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 01:13:42 -0500 Subject: [PATCH 04/55] fix: strip runner-pool suffix (-p1, -p2, ...) from hw identifier b300-p1 (and similar) artifacts were skipping ingest because the runner-pool suffix wasn't in the strip list and didn't normalize to the canonical b300 GPU key. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/normalizers.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index ad12a454..bd497f7a 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null { .replace(/-dgxc-slurm$/, '') .replace(/-dgxc$/, '') .replace(/-nb$/, '') - .replace(/-nv$/, ''); + .replace(/-nv$/, '') + .replace(/-p\d+$/, ''); // strip runner-pool suffix (e.g. 
b300-p1 → b300) return GPU_KEYS.has(base) ? base : null; } From 982106da5f4421983841304f0503b6467033852d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 09:25:33 -0500 Subject: [PATCH 05/55] feat: bold scatter labels with concurrency tag + collision avoidance - Label text now includes `C=<conc>` alongside the GPU/parallelism tag (default `<tp> C=<conc>`, advanced `<label> C=<conc>`) - Bumped point-label font-weight to 700 so the labels read clearly against the chart fill - Greedy collision-avoidance pass on render and zoom: tries placing each label above/below the point through 4 candidate dy offsets, hiding the label only when no slot is free Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 68 ++++++++++++++++++- .../src/lib/d3-chart/layers/scatter-points.ts | 1 + 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 61ac0983..3fbd8588 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -55,6 +55,63 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; +// Greedy label-collision avoidance: try positions above/below the point; +// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom. +function avoidLabelCollisions( + zoomGroup: d3.Selection, +): void { + const labels: { + el: SVGTextElement; + cx: number; + cy: number; + w: number; + h: number; + }[] = []; + zoomGroup.selectAll('.dot-group').each(function () { + const labelEl = this.querySelector('.point-label'); + if (!labelEl) return; + if ((this as SVGGElement).style.opacity === '0') return; + const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; + const m = transform.match(/translate\(([^,]+),([^)]+)\)/); + if (!m) return; + const cx = parseFloat(m[1]); + const cy = parseFloat(m[2]); + labelEl.setAttribute('dy', '-8'); + labelEl.style.opacity = '1'; + const bbox = labelEl.getBBox(); + labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height }); + }); + labels.sort((a, b) => a.cx - b.cx); + const placed: { left: number; right: number; top: number; bottom: number }[] = []; + const pad = 1; + const candidates = [-8, 14, -22, 28]; + for (const lab of labels) { + let chosenDy: number | null = null; + let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; + for (const dy of candidates) { + const top = lab.cy + dy - lab.h - pad; + const bottom = lab.cy + dy + pad; + const left = lab.cx - lab.w / 2 - pad; + const right = lab.cx + lab.w / 2 + pad; + const collides = placed.some( + (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), + ); + if (!collides) { + chosenDy = dy; + chosenBox = { left, right, top, bottom }; + break; + } + } + if (chosenDy !== null && chosenBox) { + lab.el.setAttribute('dy', String(chosenDy)); + lab.el.style.opacity = '1'; + placed.push(chosenBox); + } else { + lab.el.style.opacity = '0'; + } + } +} + // X-shape path for overlay (unofficial) data points const X_SIZE = 5; const X_HOVER_SIZE = 7; @@ -603,6 +660,7 @@ const ScatterGraph = React.memo( d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any, ); } + avoidLabelCollisions(ctx.layout.zoomGroup); }, }), [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type], @@ -1251,7 +1309,8 @@ const ScatterGraph = React.memo( getOpacity: (d) => (isPointVisible(d) ? 1 : 0), getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + getLabelText: (d) => + useAdvancedLabels ? 
`${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1353,8 +1412,11 @@ const ScatterGraph = React.memo( .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') + .attr('font-weight', '700') .attr('pointer-events', 'none') - .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp)); + .text( + useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, + ); }); // Overlay tooltip handlers @@ -1566,6 +1628,8 @@ const ScatterGraph = React.memo( }); }); + avoidLabelCollisions(zoomGroup); + // Log tick formatting on initial render if (xScaleConfig._isLog) { const xScale = ctx.xScale as d3.ScaleLogarithmic; diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 507654e1..9f2d2f38 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -72,6 +72,7 @@ export function renderScatterPoints Date: Fri, 1 May 2026 09:32:44 -0500 Subject: [PATCH 06/55] fix: stack multi-line point labels upward so they don't overlap the point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tspans now ride above the text's `dy` anchor — the LAST line sits at the anchor (just above the point) and earlier lines stack above it. Previously the second tspan landed below the anchor and crashed into the marker. Also widened collision candidates by label height so the flipped-below position fully clears the point on multi-line labels. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 28 +++++++--- .../src/lib/d3-chart/layers/scatter-points.ts | 52 +++++++++++++------ 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 3fbd8588..f8ce9b8f 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -84,8 +84,11 @@ function avoidLabelCollisions( labels.sort((a, b) => a.cx - b.cx); const placed: { left: number; right: number; top: number; bottom: number }[] = []; const pad = 1; - const candidates = [-8, 14, -22, 28]; for (const lab of labels) { + // Candidates scale with the label's own height so multi-line labels don't + // overlap the point shape when flipped below. + const below = lab.h + 8; + const candidates = [-8, below, -8 - below - 4, 2 * below]; let chosenDy: number | null = null; let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; for (const dy of candidates) { @@ -1310,7 +1313,7 @@ const ScatterGraph = React.memo( getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, getLabelText: (d) => - useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1403,7 +1406,14 @@ const ScatterGraph = React.memo( // Labels const showLabels = !hidePointLabels && !showGradientLabels; overlayPoints.each(function (d) { - d3.select(this) + const lines = showLabels + ? (useAdvancedLabels + ? `${getPointLabel(d)}\nC=${d.conc}` + : `${d.tp}\nC=${d.conc}` + ).split('\n') + : []; + const text = d3 + .select(this) .selectAll('.overlay-label') .data(showLabels ? 
[true] : []) .join('text') @@ -1413,10 +1423,14 @@ const ScatterGraph = React.memo( .style('fill', 'var(--foreground)') .attr('font-size', '10px') .attr('font-weight', '700') - .attr('pointer-events', 'none') - .text( - useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, - ); + .attr('pointer-events', 'none'); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .text((l) => l); }); // Overlay tooltip handlers diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 9f2d2f38..13c588d8 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -63,18 +63,30 @@ export function renderScatterPoints` element — the + // intra-stack offsets stay correct whether the label ends up above or below. if (!config.hideLabels && config.getLabelText && config.foreground) { - entered - .append('text') - .attr('class', 'point-label') - .attr('dy', -8) - .attr('text-anchor', 'middle') - .attr('fill', config.foreground) - .attr('font-size', '10px') - .attr('font-weight', '700') - .attr('pointer-events', 'none') - .text(config.getLabelText); + const labelGetter = config.getLabelText; + entered.each(function (d) { + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .append('text') + .attr('class', 'point-label') + .attr('dy', -8) + .attr('text-anchor', 'middle') + .attr('fill', config.foreground!) + .attr('font-size', '10px') + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + lines.forEach((line, i) => { + const tspanDy = i === 0 ? 
`-${(lines.length - 1) * 1.1}em` : '1.1em'; + text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line); + }); + }); } // Exit: remove stale points @@ -103,9 +115,12 @@ export function renderScatterPoints('.point-label') + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .selectAll('.point-label') .data([true]) .join('text') .attr('class', 'point-label') @@ -113,8 +128,15 @@ export function renderScatterPoints('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .text((l) => l); }); } else { points.selectAll('.point-label').remove(); From 37eecc6e28c10751ffc52c8a0d0588177e43d4d8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 09:38:39 -0500 Subject: [PATCH 07/55] fix: anchor multi-line labels via first tspan + tspan-aware collision pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a `<text>` contains tspans, the parent's `dy` does not shift the bbox cleanly — its (unused) y=0 origin still factors in, so the rendered text ended up centered on the point. Move the absolute offset into the FIRST tspan's `dy`; later tspans cascade by 1.1em. Collision avoidance now drives the first tspan's `dy` and tries four candidate baselines (primary above, primary below, secondary above, secondary below), accounting for full label height when picking a non-overlapping slot. Labels still hidden as a last resort. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 72 +++++++++++++------ .../src/lib/d3-chart/layers/scatter-points.ts | 25 ++++--- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index f8ce9b8f..27d3680c 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -55,58 +55,88 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; -// Greedy label-collision avoidance: try positions above/below the point; -// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom. +// Greedy label-collision avoidance. +// Each candidate is the y-position of the FIRST baseline (relative to point +// center) which we apply via the first tspan's `dy` — later tspans cascade +// down by 1.1em. We try above/below at primary and secondary offsets, and +// hide the label if all four positions collide. function avoidLabelCollisions( zoomGroup: d3.Selection, ): void { - const labels: { + interface LabelInfo { el: SVGTextElement; + firstTspan: SVGTSpanElement; cx: number; cy: number; w: number; - h: number; - }[] = []; + nLines: number; + defaultFirstY: number; + } + const labels: LabelInfo[] = []; + const ASCENT = 9; + const DESCENT = 3; + const LINE_H = 11; + zoomGroup.selectAll('.dot-group').each(function () { const labelEl = this.querySelector('.point-label'); if (!labelEl) return; if ((this as SVGGElement).style.opacity === '0') return; + const tspans = labelEl.querySelectorAll('tspan'); + if (tspans.length === 0) return; const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; const m = transform.match(/translate\(([^,]+),([^)]+)\)/); if (!m) return; const cx = parseFloat(m[1]); const cy = parseFloat(m[2]); - labelEl.setAttribute('dy', '-8'); + const nLines = tspans.length; + const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point + // Reset to default before measuring so prior positioning doesn't bias bbox + tspans[0].setAttribute('dy', `${defaultFirstY}px`); labelEl.style.opacity = '1'; const bbox = labelEl.getBBox(); - labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height }); + labels.push({ + el: labelEl, + firstTspan: tspans[0], + cx, + cy, + w: bbox.width, + nLines, + defaultFirstY, + }); }); + labels.sort((a, b) => a.cx - b.cx); const placed: { left: number; right: number; top: number; bottom: number }[] = []; - const pad = 1; + const pad = 2; + for (const lab of labels) { - // Candidates scale with the label's own height so multi-line labels don't - // overlap the point shape when flipped below. - const below = lab.h + 8; - const candidates = [-8, below, -8 - below - 4, 2 * below]; - let chosenDy: number | null = null; + const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT; + const aboveFirstY = lab.defaultFirstY; + const belowFirstY = 14; // first baseline 14px below point center + const candidates = [ + aboveFirstY, + belowFirstY, + aboveFirstY - blockH - 2, + belowFirstY + blockH + 2, + ]; + let chosenY: number | null = null; let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; - for (const dy of candidates) { - const top = lab.cy + dy - lab.h - pad; - const bottom = lab.cy + dy + pad; + for (const firstY of candidates) { + const top = lab.cy + firstY - ASCENT - pad; + const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad; const left = lab.cx - lab.w / 2 - pad; const right = lab.cx + lab.w / 2 + pad; const collides = placed.some( (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), ); if 
(!collides) { - chosenDy = dy; + chosenY = firstY; chosenBox = { left, right, top, bottom }; break; } } - if (chosenDy !== null && chosenBox) { - lab.el.setAttribute('dy', String(chosenDy)); + if (chosenY !== null && chosenBox) { + lab.firstTspan.setAttribute('dy', `${chosenY}px`); lab.el.style.opacity = '1'; placed.push(chosenBox); } else { @@ -1418,18 +1448,18 @@ const ScatterGraph = React.memo( .data(showLabels ? [true] : []) .join('text') .attr('class', 'overlay-label') - .attr('dy', -10) .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') .attr('font-weight', '700') .attr('pointer-events', 'none'); + const firstDy = -(1 + (lines.length - 1) * 1.1); text .selectAll('tspan') .data(lines) .join('tspan') .attr('x', 0) - .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) .text((l) => l); }); diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 13c588d8..71d1f050 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -64,10 +64,10 @@ export function renderScatterPoints` element — the - // intra-stack offsets stay correct whether the label ends up above or below. + // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't + // pick up the text element's own (unused) y=0 origin. The first tspan is + // raised so the LAST line baseline lands ~8px above the point; subsequent + // tspans cascade down by 1.1em. if (!config.hideLabels && config.getLabelText && config.foreground) { const labelGetter = config.getLabelText; entered.each(function (d) { @@ -76,15 +76,18 @@ export function renderScatterPoints { - const tspanDy = i === 0 ? 
`-${(lines.length - 1) * 1.1}em` : '1.1em'; - text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line); + text + .append('tspan') + .attr('x', 0) + .attr('dy', i === 0 ? `${firstDy}em` : '1.1em') + .text(line); }); }); } @@ -113,7 +116,9 @@ export function renderScatterPoints('tspan') .data(lines) .join('tspan') .attr('x', 0) - .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) .text((l) => l); }); } else { From f317377dfaea35f9cb5dc435ea177966aa17fbf8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 10:21:00 -0500 Subject: [PATCH 08/55] fix: dedupe artifacts by logical name + skip 0-successful agg rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two complementary fixes for runs whose `results_bmk` aggregated artifact ends up containing both a successful row and a failed-attempt row for the same (config, conc, offload) — the failed row's null metrics were overwriting the good row via ON CONFLICT DO UPDATE. 1. Artifact-level: strip the trailing `_<runner>_<index>` suffix from each artifact name and group by the logical name, keeping only the most recent per group. 2. Row-level: skip rows with `num_requests_successful === 0` AND `num_requests_total > 0`. The aggregated artifact merges rows from all runners — including failed ones — so artifact-level dedup alone can't reach inside it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/benchmark-mapper.ts | 14 +++++++++++ packages/db/src/etl/skip-tracker.ts | 10 +++++++- packages/db/src/ingest-ci-run.ts | 33 ++++++++++++++++++++----- packages/db/src/ingest-gcs-backup.ts | 1 + 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index d842276e..1aff5ea9 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -145,6 +145,20 @@ export function mapBenchmarkRow( return null; } + // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from + // every runner, including ones with 0 successful requests and null metrics. + // Without this skip, the empty row's nulls overwrite a good row via + // ON CONFLICT DO UPDATE when both share the same (config, conc, offload). + if ( + typeof row.num_requests_successful === 'number' && + row.num_requests_successful === 0 && + typeof row.num_requests_total === 'number' && + row.num_requests_total > 0 + ) { + tracker.skips.failedRun++; + return null; + } + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` // ('none' → 'off'; any other non-empty value → 'on'). const offloadModeRaw = diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts index 6166ea44..588718dd 100644 --- a/packages/db/src/etl/skip-tracker.ts +++ b/packages/db/src/etl/skip-tracker.ts @@ -8,6 +8,7 @@ export interface Skips { unmappedModel: number; unmappedHw: number; noIslOsl: number; + failedRun: number; dbError: number; } @@ -66,7 +67,14 @@ const MAX_DB_ERRORS = 10; * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets. 
*/ export function createSkipTracker(): SkipTracker { - const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 }; + const skips: Skips = { + badZip: 0, + unmappedModel: 0, + unmappedHw: 0, + noIslOsl: 0, + failedRun: 0, + dbError: 0, + }; const unmappedModels = new Set(); const unmappedHws = new Set(); const unmappedPrecisions = new Set(); diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 8cce43ca..fb1fbbbc 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -101,15 +101,30 @@ if (isDownloadMode) { } catch {} } - const byName = new Map(); + // Strip the trailing `_<runner>_<index>` token from each + // artifact name, then group by the resulting logical name and keep only + // the most recent per group. Without this, two artifacts produced on + // different runners for the same logical config (e.g. `…_h200-cw_00` and + // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty + // metrics can overwrite the good one via ON CONFLICT DO UPDATE. + // + // The runner pool name itself has no underscores (`h200-cw`, + // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip + // bounded — using `\w` here would over-match across earlier `_` + // separators and collapse different (conc, offload) variants into the + // same logical name. 
+ const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/; + const byLogical = new Map(); for (const a of allArtifacts) { - const existing = byName.get(a.name); + const key = a.name.replace(RUNNER_SUFFIX_RE, ''); + const existing = byLogical.get(key); if (!existing || a.created_at > existing.created_at) { - byName.set(a.name, a); + byLogical.set(key, a); } } - for (const [name, artifact] of byName) { + for (const [, artifact] of byLogical) { + const name = artifact.name; console.log(` ${name}`); const zipPath = path.join(artifactsDir, 'artifact.zip'); execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { @@ -121,7 +136,7 @@ if (isDownloadMode) { fs.unlinkSync(zipPath); } - console.log(`\n Downloaded ${byName.size} artifact(s)`); + console.log(`\n Downloaded ${byLogical.size} artifact(s)`); // Fetch run attempt from API const attemptStr = execSync( @@ -510,11 +525,17 @@ async function main(): Promise { const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker; const totalSkips = - skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError; + skips.badZip + + skips.unmappedModel + + skips.unmappedHw + + skips.noIslOsl + + skips.failedRun + + skips.dbError; if (totalSkips > 0) { console.log(`\n Skipped: ${totalSkips} rows`); const skipLines: [string, number][] = [ ['no isl/osl (old format)', skips.noIslOsl], + ['failed run (0 successful)', skips.failedRun], ['unmapped model', skips.unmappedModel], ['unmapped hw', skips.unmappedHw], ['bad/empty zip', skips.badZip], diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index 6dc604e9..d67f5164 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -434,6 +434,7 @@ async function mapWorkflowDir( unmappedModel: local.skips.unmappedModel, unmappedHw: local.skips.unmappedHw, noIslOsl: local.skips.noIslOsl, + failedRun: local.skips.failedRun, }, localUnmappedModels: new 
Set(local.unmappedModels), localUnmappedHws: new Set(local.unmappedHws), From c2f66f62f5a1dedb6a87c7c5e58ca990b3cb0956 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 7 May 2026 08:41:26 -0500 Subject: [PATCH 09/55] feat: add AIPerf to FRAMEWORK_LABELS Tag display name for the `aiperf` spec_method suffix used by the alternate-harness runs ingested for the agentic minimax sweep. Without this entry the legend shows 'AIPERF' from the default toUpperCase fallback. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/constants/src/framework-aliases.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts index cc5eb6b4..e23a93bc 100644 --- a/packages/constants/src/framework-aliases.ts +++ b/packages/constants/src/framework-aliases.ts @@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record = { ]), ), mtp: 'MTP', + aiperf: 'AIPerf', }; /** From 024797a978a2a6e2954f66a963de3205b62a149e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 12 May 2026 15:02:07 -0500 Subject: [PATCH 10/55] fix(changelog): coerce ids to string when filtering changelog by run bigint workflow_run_id sometimes deserializes as a number on the frontend depending on the postgres adapter's behavior; strict === between a number and a string silently dropped every match, so the changelog popover always reported "no changelog data available." 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/GlobalFilterContext.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 08fc7094..11e56de7 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -87,7 +87,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record { const runs: Record = {}; for (const run of data.runs) { const runId = String(run.github_run_id); - const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id); + const runChangelogs = data.changelogs.filter( + (c) => String(c.workflow_run_id) === String(run.github_run_id), + ); runs[runId] = { runId, runDate: run.created_at, From aa154193dfbc12535f25444cdf6fccc16a3e1382 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 12 May 2026 15:36:57 -0500 Subject: [PATCH 11/55] feat: default sequence to Agentic Traces when available If the selected model has agentic_traces data, prefer that over the default 8K/1K fixed-seq when the user hasn't explicitly chosen via URL. effectiveSequence already falls back to availableSequences[0] for models without agentic, so models with only fixed-seq data still render correctly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/GlobalFilterContext.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 11e56de7..7813d079 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -125,7 +125,9 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { const [selectedSequence, setSelectedSequence] = useState(() => { const urlSeq = getUrlParam('i_seq'); if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence; - return Sequence.EightK_OneK; + // Prefer Agentic Traces by default when the selected model has it; the + // effectiveSequence fallback below handles models without agentic data. + return Sequence.AgenticTraces; }); const [selectedPrecisions, setSelectedPrecisionsRaw] = useState(() => { From 099a33efcb53f5130dc40d715a0f4b86d6136a93 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:25:25 -0500 Subject: [PATCH 12/55] fix(agentic): respect percentile selector for input-throughput x axis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rowToAggDataEntry was only copying median/p99 metric variants — picking p90/p99.9 in the percentile selector silently fell back to 0 and collapsed every point into a vertical line at x=0. Copy the full median/p90/p99/p99.9 set into AggDataEntry. Hide the X-Axis Metric dropdown for agentic mode (it doubled up with the percentile selector) and route the input-metric chart through withPercentile so picking p99 actually plots p99_ttft instead of the hard-coded p99_ttft config default. Percentile options pared back to median + p99. 
--- .../inference/hooks/useChartData.ts | 46 +++++++++++++++++-- .../app/src/components/inference/types.ts | 10 ++++ .../components/inference/ui/ChartControls.tsx | 3 +- packages/app/src/lib/benchmark-transform.ts | 12 ++++- packages/app/src/lib/data-mappings.ts | 8 +--- packages/app/src/lib/energy-metrics.test.ts | 10 ++++ 6 files changed, 77 insertions(+), 12 deletions(-) diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 81ab0780..57e9a1c2 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -16,7 +16,7 @@ import { filterDataByCostLimit } from '@/components/inference/utils'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants'; import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; /** Build deduplicated comparison dates, excluding the main run date. */ @@ -216,7 +216,14 @@ export function useChartData( ? 'P99 Time To First Token (s)' : 'Median Time To First Token (s)'; - if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) { + const isAgentic = selectedSequence === Sequence.AgenticTraces; + + if ( + effectiveXMetric && + chartDef.chartType === 'interactivity' && + isInputMetric && + !isAgentic + ) { xAxisField = effectiveXMetric as keyof AggDataEntry; const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) { @@ -225,15 +232,40 @@ export function useChartData( xAxisLabel = isTtftOverride ? 
ttftLabel : chartDef.x_label; } } else if (chartDef.chartType === 'interactivity' && isInputMetric) { + // Agentic falls through here too — the manual X-axis dropdown is + // hidden in agentic mode (would double up with the percentile + // selector), so the config default + percentile post-processing + // below drives the x axis. const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition; const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x; xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label; - } else if (chartDef.chartType === 'e2e' && isTtftOverride) { + } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) { xAxisField = effectiveXMetric as keyof AggDataEntry; xAxisLabel = ttftLabel; } + // Agentic: rewrite the resolved x metric to the chosen percentile, + // and relabel accordingly. naturalX is already percentile-adjusted, + // so the per-metric override path is the only one that actually + // changes here. + if (isAgentic) { + const adjusted = withPercentile( + xAxisField as string, + selectedPercentile, + ) as keyof AggDataEntry; + if (adjusted !== xAxisField) { + const pctlWord = + selectedPercentile === 'median' + ? 'Median' + : selectedPercentile === 'p99.9' + ? 'P99.9' + : selectedPercentile.toUpperCase(); + xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord); + xAxisField = adjusted; + } + } + // The x-axis is "flipped" only when the good-direction reverses // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). 
// E2EL → TTFT keeps the same direction ("lower is better" for both), @@ -269,7 +301,13 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile], + [ + selectedYAxisMetric, + selectedXAxisMetric, + selectedE2eXAxisMetric, + selectedPercentile, + selectedSequence, + ], ); // Build renderable graphs (data processing + stable chart definitions) diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a2d9ef2e..cddeba54 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -50,23 +50,33 @@ export interface AggDataEntry { mean_ttft: number; median_ttft: number; std_ttft: number; + p90_ttft: number; p99_ttft: number; + 'p99.9_ttft': number; mean_tpot: number; mean_intvty: number; median_tpot: number; median_intvty: number; std_tpot: number; std_intvty: number; + p90_tpot: number; + p90_intvty: number; p99_tpot: number; p99_intvty: number; + 'p99.9_tpot': number; + 'p99.9_intvty': number; mean_itl: number; median_itl: number; std_itl: number; + p90_itl: number; p99_itl: number; + 'p99.9_itl': number; mean_e2el: number; median_e2el: number; std_e2el: number; + p90_e2el: number; p99_e2el: number; + 'p99.9_e2el': number; disagg: boolean; num_prefill_gpu: number; num_decode_gpu: number; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 6707bd9e..7b4fa08f 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -269,7 +269,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
{graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') && - isInputMetric && ( + isInputMetric && + selectedSequence !== Sequence.AgenticTraces && (
): Record { const out: Record = {}; - for (const suffix of ['mean', 'median', 'p90', 'p99']) { + for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) { const itl = m[`${suffix}_itl`]; const ttlt = m[`${suffix}_ttlt`]; if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; @@ -62,23 +62,33 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { mean_ttft: m.mean_ttft ?? 0, median_ttft: m.median_ttft ?? 0, std_ttft: m.std_ttft ?? 0, + p90_ttft: m.p90_ttft ?? 0, p99_ttft: m.p99_ttft ?? 0, + 'p99.9_ttft': m['p99.9_ttft'] ?? 0, mean_tpot: m.mean_tpot ?? 0, median_tpot: m.median_tpot ?? 0, std_tpot: m.std_tpot ?? 0, + p90_tpot: m.p90_tpot ?? 0, p99_tpot: m.p99_tpot ?? 0, + 'p99.9_tpot': m['p99.9_tpot'] ?? 0, mean_intvty: m.mean_intvty ?? 0, median_intvty: m.median_intvty ?? 0, std_intvty: m.std_intvty ?? 0, + p90_intvty: m.p90_intvty ?? 0, p99_intvty: m.p99_intvty ?? 0, + 'p99.9_intvty': m['p99.9_intvty'] ?? 0, mean_itl: m.mean_itl ?? 0, median_itl: m.median_itl ?? 0, std_itl: m.std_itl ?? 0, + p90_itl: m.p90_itl ?? 0, p99_itl: m.p99_itl ?? 0, + 'p99.9_itl': m['p99.9_itl'] ?? 0, mean_e2el: m.mean_e2el ?? 0, median_e2el: m.median_e2el ?? 0, std_e2el: m.std_e2el ?? 0, + p90_e2el: m.p90_e2el ?? 0, p99_e2el: m.p99_e2el ?? 0, + 'p99.9_e2el': m['p99.9_e2el'] ?? 0, disagg: row.disagg, num_prefill_gpu: row.num_prefill_gpu, num_decode_gpu: row.num_decode_gpu, diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index f137875c..bf48c864 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -186,21 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; /** * Percentile of the latency distribution used for the chart x-axis when * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants - * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which - * slice to plot. 
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the + * two most commonly read slices (p50, p99) are surfaced in the UI. */ export enum Percentile { Median = 'median', - P90 = 'p90', P99 = 'p99', - P99_9 = 'p99.9', } const PERCENTILE_CONFIG: Record = { [Percentile.Median]: { label: 'p50 (median)' }, - [Percentile.P90]: { label: 'p90' }, [Percentile.P99]: { label: 'p99' }, - [Percentile.P99_9]: { label: 'p99.9' }, }; export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts index 28cc1e36..54788585 100644 --- a/packages/app/src/lib/energy-metrics.test.ts +++ b/packages/app/src/lib/energy-metrics.test.ts @@ -57,23 +57,33 @@ function makeEntry(overrides: Partial = {}): AggDataEntry { mean_ttft: 0.5, median_ttft: 0.4, std_ttft: 0.1, + p90_ttft: 0.7, p99_ttft: 0.8, + 'p99.9_ttft': 0.9, mean_tpot: 0.02, mean_intvty: 45, median_tpot: 0.02, median_intvty: 44, std_tpot: 0.005, std_intvty: 5, + p90_tpot: 0.025, + p90_intvty: 55, p99_tpot: 0.03, p99_intvty: 60, + 'p99.9_tpot': 0.035, + 'p99.9_intvty': 65, mean_itl: 0.01, median_itl: 0.01, std_itl: 0.002, + p90_itl: 0.013, p99_itl: 0.015, + 'p99.9_itl': 0.018, mean_e2el: 5, median_e2el: 4.8, std_e2el: 0.5, + p90_e2el: 5.5, p99_e2el: 6, + 'p99.9_e2el': 6.5, disagg: false, num_prefill_gpu: 0, num_decode_gpu: 0, From 50a06d1419c70ddd8d24b2c6545da44fe6be3a4d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:27:19 -0500 Subject: [PATCH 13/55] fix(agentic): default percentile to p99 and drop median option --- packages/app/src/components/inference/InferenceContext.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index b4ccb9ef..af2d364e 100644 --- 
a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -122,7 +122,7 @@ export function InferenceProvider({ // Latency percentile applied to the chart x-axis for agentic scenarios. // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. const [selectedPercentile, setSelectedPercentile] = useState( - () => getUrlParam('i_pctl') || 'median', + () => getUrlParam('i_pctl') || 'p99', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index bf48c864..1b4f47c3 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -186,16 +186,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; /** * Percentile of the latency distribution used for the chart x-axis when * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants - * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the - * two most commonly read slices (p50, p99) are surfaced in the UI. + * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p99 + * is surfaced in the UI. 
*/ export enum Percentile { - Median = 'median', P99 = 'p99', } const PERCENTILE_CONFIG: Record = { - [Percentile.Median]: { label: 'p50 (median)' }, [Percentile.P99]: { label: 'p99' }, }; From 3c96e9137776d1c368a0acdfeee6e769d5733464 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:31:27 -0500 Subject: [PATCH 14/55] fix(agentic): keep only p90 as the percentile option --- packages/app/src/components/inference/InferenceContext.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 0ba14a21..accfdf9e 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -136,7 +136,7 @@ export function InferenceProvider({ // Latency percentile applied to the chart x-axis for agentic scenarios. // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. 
const [selectedPercentile, setSelectedPercentile] = useState( - () => getUrlParam('i_pctl') || 'p99', + () => getUrlParam('i_pctl') || 'p90', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 0afb304a..83e6648a 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -191,12 +191,10 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; */ export enum Percentile { P90 = 'p90', - P99 = 'p99', } const PERCENTILE_CONFIG: Record = { [Percentile.P90]: { label: 'p90' }, - [Percentile.P99]: { label: 'p99' }, }; export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; From 642081af77c8165ac89a5177abbd6c0244dfb9c0 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Fri, 15 May 2026 13:31:30 -0400 Subject: [PATCH 15/55] fix(agentic): default percentile to p90, surface only p90/p99 Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/cypress/support/mock-data.ts | 2 +- .../app/src/components/inference/InferenceContext.tsx | 2 +- .../app/src/components/inference/hooks/useChartData.ts | 9 ++------- packages/app/src/components/ui/chart-selectors.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 6 ++++-- packages/app/src/lib/url-state.ts | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index f267dcc9..34b89aba 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,7 +189,7 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), - selectedPercentile: 'median', + 
selectedPercentile: 'p90', setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index accfdf9e..36dc672d 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -134,7 +134,7 @@ export function InferenceProvider({ () => getUrlParam('i_e2e_xmetric') || null, ); // Latency percentile applied to the chart x-axis for agentic scenarios. - // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. + // Values: 'p90' | 'p99'. Non-agentic charts ignore. const [selectedPercentile, setSelectedPercentile] = useState( () => getUrlParam('i_pctl') || 'p90', ); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index f2ef85ec..436fd662 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -83,7 +83,7 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, - selectedPercentile = 'median', + selectedPercentile = 'p90', /** When set, only series for these two registry GPU keys are shown (compare pages). */ compareGpuPair?: readonly [string, string] | null, ) { @@ -261,12 +261,7 @@ export function useChartData( selectedPercentile, ) as keyof AggDataEntry; if (adjusted !== xAxisField) { - const pctlWord = - selectedPercentile === 'median' - ? 'Median' - : selectedPercentile === 'p99.9' - ? 
'P99.9' - : selectedPercentile.toUpperCase(); + const pctlWord = selectedPercentile.toUpperCase(); xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord); xAxisField = adjusted; } diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index d2940de4..e30816fa 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -315,7 +315,7 @@ export function PercentileSelector({ - P99 TTFT - Median TTFT + P90 TTFT
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index f0e1692a..78df2c37 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -408,27 +408,20 @@ export default function ChartDisplay() { if ( graph.chartDefinition.chartType === 'interactivity' && isInputMetric && - selectedXAxisMetric + selectedXAxisMetric === 'p90_ttft' ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; - } + return 'vs. P90 Time To First Token'; } // For e2e chart: render clickable inline dropdown for x-axis if (graph.chartDefinition.chartType === 'e2e') { const xAxisLabel = - selectedE2eXAxisMetric === 'p99_ttft' - ? 'P99 TTFT' - : selectedE2eXAxisMetric === 'median_ttft' - ? 'Median TTFT' - : 'End-to-end Latency'; + selectedE2eXAxisMetric === 'p90_ttft' + ? 
'P90 TTFT' + : 'End-to-end Latency'; const xAxisOptions = [ { value: null, label: 'End-to-end Latency' }, - { value: 'p99_ttft', label: 'P99 TTFT' }, - { value: 'median_ttft', label: 'Median TTFT' }, + { value: 'p90_ttft', label: 'P90 TTFT' }, ]; const zoomPrefix = selectedDateRange.startDate && diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts index 8f8705e1..589ba580 100644 --- a/packages/app/src/components/inference/utils.test.ts +++ b/packages/app/src/components/inference/utils.test.ts @@ -157,12 +157,12 @@ describe('processOverlayChartData', () => { }); it('remaps x to config override for input metrics on interactivity chart', () => { - // inputTputPerGpu has x override to p99_ttft on interactivity chart + // inputTputPerGpu has x override to p90_ttft on interactivity chart const data = [ pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_intvty: 50, } as any), ]; @@ -176,16 +176,11 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - median_ttft: 0.1, + p90_ttft: 0.1, median_intvty: 50, } as any), ]; - const result = processOverlayChartData( - data, - 'interactivity', - 'y_inputTputPerGpu', - 'median_ttft', - ); + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.1); }); @@ -195,76 +190,62 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_e2el: 2.5, } as any), ]; const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null); expect(result).toHaveLength(1); - // e2e uses median_e2el as x (from chart config default), not p99_ttft + // e2e uses median_e2el as x (from chart config default), not p90_ttft expect(result[0].x).toBe(2.5); }); - it('remaps x to TTFT for e2e chart when 
selectedXAxisMetric is p99_ttft', () => { - const data = [ - pt({ - x: 100, - tpPerGpu: { y: 42, roof: false }, - p99_ttft: 0.35, - median_e2el: 2.5, - } as any), - ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); - expect(result).toHaveLength(1); - expect(result[0].x).toBe(0.35); - }); - - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => { + it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => { const data = [ pt({ x: 100, tpPerGpu: { y: 42, roof: false }, - median_ttft: 0.12, + p90_ttft: 0.12, median_e2el: 2.5, } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.12); }); it('filters e2e TTFT outliers exceeding y_latency_limit', () => { const data = [ - pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any), - pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any), + pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any), + pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); it('does not filter interactivity points by latency limit when x-axis is default', () => { - // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity + // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity // chart's x-axis stays median_intvty for non-input metrics. The latency limit // (60) must NOT apply to median_intvty values. 
const data = [ pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any), pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(2); }); it('applies latency limit on interactivity only when x-axis is actually overridden', () => { - // When an input metric IS selected and x-axis overrides to p99_ttft, + // When an input metric IS selected and x-axis overrides to p90_ttft, // the latency limit should apply. const data = [ - pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any), - pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any), + pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any), + pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft'); - // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999 + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); + // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999 expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts index 4b5335b6..735007ab 100644 --- a/packages/app/src/components/inference/utils.ts +++ b/packages/app/src/components/inference/utils.ts @@ -88,8 +88,7 @@ export function processOverlayChartData( let xAxisField: string = chartDef.x; // selectedXAxisMetric is already the effective metric for this chart type // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric) - const isTtftOverride = - 
selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + const isTtftOverride = selectedXAxisMetric === 'p90_ttft'; if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { xAxisField = selectedXAxisMetric; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index e30816fa..19b4bfb0 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -315,7 +315,7 @@ export function PercentileSelector({ From 19b99586353cd39bccd4072bd6e2a2afcaf73367 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 26 May 2026 18:32:26 -0500 Subject: [PATCH 43/55] fix(scenario-selector): wrap Deprecated header in SelectLabel only inside Select MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous commit (b3e315c) changed DeprecatedSectionTitle to render SelectLabel internally, which throws at runtime ("SelectLabel must be used within SelectGroup") in callsites that render the header via MultiSelect — MultiSelect wraps the header in its own div, not a Radix SelectGroup. Revert the component to a plain styled span (MultiSelect's div wrapper supplies the small/muted styling), and wrap with SelectLabel only at the ScenarioSelector callsite, where the header sits directly inside a SelectGroup. Co-Authored-By: Claude Opus 4.7 --- .../app/src/components/ui/chart-selectors.tsx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index 8b91059a..49ea3f1a 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -31,9 +31,16 @@ import { sequenceKind, } from '@/lib/data-mappings'; +/** + * "Deprecated" sub-header used by selectors. 
Rendered as a span (not + * SelectLabel) because some callsites use `MultiSelect`, which wraps + * headers in its own div and isn't a SelectGroup. The span carries no + * styling of its own — the parent context supplies the muted/small + * treatment. ScenarioSelector renders this inside a SelectLabel directly. + */ function DeprecatedSectionTitle({ reason }: { reason: string }) { return ( - + Deprecated @@ -43,7 +50,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) { {reason} - + ); } @@ -282,7 +289,9 @@ export function ScenarioSelector({ ))} {fixedGroups.deprecated.length > 0 && ( <> - + + + {fixedGroups.deprecated.map((seq) => ( {getSequenceLabel(seq as Sequence)} From 7114833409b92a206f7c22b80846db527e01da43 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 13:22:13 -0500 Subject: [PATCH 44/55] feat(agentic-detail): add cumulative input tokens chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces a new chart on the agentic detail page showing the running total of input (prompt) tokens served over the course of the run — useful for seeing how the load actually accumulates vs the instantaneous prefill_tps line we already plot. Adds a `cumulativeSum` helper alongside the existing `cumulativeAverage` and `sumSeries` time-series utilities. No backfill needed — the source data (`chart_series.prefillTps`) is already pre-computed at ingest time for every blob-bearing row. (Input throughput as a Pareto axis is already wired via the existing `y_inputTputPerGpu` y-axis option; no change there.) 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 24 +++++++++++++++++++ .../agentic-point/time-series-chart.tsx | 17 +++++++++++++ 2 files changed, 41 insertions(+) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 2e43b4fb..1a61b93b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -26,6 +26,7 @@ import { StackedAreaChart, TimeSeriesChart, cumulativeAverage, + cumulativeSum, rollingAverage, sumSeries, } from './time-series-chart'; @@ -381,6 +382,29 @@ export function AgenticPointDetail({ id }: Props) { ); }} /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + + ); + }} + /> )} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index cd10aff7..042c4331 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -58,6 +58,23 @@ export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** + * Running cumulative sum of a per-interval rate series. Each output point + * is the integral of the rate from start to that point, assuming the rate + * applies over a 1-second window (aiperf's scrape interval). Use for + * "total tokens served so far" from a tokens-per-second series. 
+ */ +export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { + if (data.length === 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + let sum = 0; + for (let i = 0; i < data.length; i++) { + sum += data[i]!.value; + out[i] = { t: data[i]!.t, value: sum }; + } + return out; +} + /** Pointwise sum of two arrays sharing the same t index. */ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { const n = Math.min(a.length, b.length); From c6697de8ff3d8263924986fd71b4622f1369f9a3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 14:44:19 -0500 Subject: [PATCH 45/55] feat(agentic-detail): plot cumulative unique input tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the "Total input tokens over time" chart with "Total unique input tokens over time" — cumsum of (prompt-token rate − prefix-cache- hit rate per second), which equals the cumulative tokens vllm actually had to prefill from scratch (= vllm:request_prefill_kv_computed_tokens). Adds `prefixCacheHitsTps` to the chart_series JSONB (extracted by summing vllm:prefix_cache_hits.rate across all engine series, same DP- aware path as prefillTps). Bumps CHART_SERIES_VERSION to 3; the existing trace-server-metrics query defaults the field to [] for any older v2 rows so reads stay safe before backfill catches up. Backfilled 62 rows to v3. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 14 +++++++++++--- .../src/hooks/api/use-trace-server-metrics.ts | 2 ++ packages/db/src/etl/compute-chart-series.ts | 16 +++++++++++++++- packages/db/src/queries/trace-server-metrics.ts | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 1a61b93b..4bebd37c 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -384,16 +384,24 @@ export function AgenticPointDetail({ id }: Props) { /> { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; + // Unique = total prompt tokens vllm received minus the tokens + // it served from the prefix cache. The cache-miss portion is + // what actually constitutes "new content" the GPU had to + // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens. + const unique = sumSeries( + metrics.prefillTps, + metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })), + ); return ( ; prefillTps: TimeSeriesPoint[]; decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). 
*/ + prefixCacheHitsTps: TimeSeriesPoint[]; } async function fetchTraceServerMetrics( diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 530600cf..91e89521 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -25,8 +25,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * only series[0], which under-counted by Nx on multi-engine DP/PP * deployments — most visible as a request-queue-depth chart that maxed out * at ~3 when the timeline clearly showed 20+ in-flight). + * + * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative + * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). */ -export const CHART_SERIES_VERSION = 2; +export const CHART_SERIES_VERSION = 3; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -57,6 +60,13 @@ export interface ChartSeries { promptTokensBySource: Record; prefillTps: TimeSeriesPoint[]; decodeTps: TimeSeriesPoint[]; + /** + * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across + * engines. Detail page derives "cumulative unique input tokens" as + * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually + * saved vs the raw queries that came in. + */ + prefixCacheHitsTps: TimeSeriesPoint[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -249,6 +259,9 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { })); const prefillTps = counterRate('vllm:prompt_tokens'); const decodeTps = counterRate('vllm:generation_tokens'); + // Tokens served from prefix cache per scrape. Lets the frontend derive + // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits'); // Per-source prompt tokens — sum across engines per source label. 
const promptBySrcByT = new Map>(); @@ -286,5 +299,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { promptTokensBySource, prefillTps, decodeTps, + prefixCacheHitsTps, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 624b6ed3..76775e77 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -71,6 +71,8 @@ export interface TraceServerMetrics { prefillTps: TimeSeriesPoint[]; /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */ decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ + prefixCacheHitsTps: TimeSeriesPoint[]; } interface RawMetaRow extends PointMeta { @@ -114,6 +116,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { promptTokensBySource: series.promptTokensBySource, prefillTps: series.prefillTps, decodeTps: series.decodeTps, + // v2 chart_series rows pre-backfill don't have this field — default to [] + prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], }; } From b5679bb10acfd6a6765b48a5864b2a0ec73d4915 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 15:00:12 -0500 Subject: [PATCH 46/55] feat(request-timeline): expandable subagent -> stream rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The harness fans a single subagent into multiple parallel ":sN" streams when its inner requests overlap in time (weka_trace._pack_into_streams). Previously each :sN got its own swimlane row, which made one parent conversation with 5 subagents (each fanned into 2-8 streams) render as 23 separate rows — visually implying 23 distinct subagent invocations when really there are 5. Now: each subagent shows as one row by default with a chevron + stream count chip ("subagent 003 · f1e7 ×8"). 
The collapsed row draws the union of all stream bars overlaid, so the concurrency burst is still visible at a glance. Click the chevron to fan into per-stream rows; click again to collapse. For conv 0f5b266f in benchmark 206360: 23 rows → 5 rows by default. Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/request-timeline.tsx | 325 ++++++++++++------ 1 file changed, 226 insertions(+), 99 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index bcbe105a..8762a158 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -53,44 +53,84 @@ const PHASE_COLORS: Record = { unknown: '#64748b', }; +/** + * Row kinds: + * parent — top-level conversation (depth 0) + * worker — worker swimlane (depth 0, worker mode) + * subagent — a subagent invocation (depth 1). Either a single + * stream (renders its own bars), or a multi-stream + * container whose bars are the union of its streams + * when collapsed. + * stream — one :sN stream of a multi-stream subagent (depth 2). + * Hidden by default; toggled in via the parent's chevron. + */ +type RowKind = 'parent' | 'worker' | 'subagent' | 'stream'; + interface Row { key: string; label: string; color: string; requests: RequestRecord[]; - /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */ depth: number; - /** True if this row is a sub-agent ("Subagent N of parent X"). */ - isSubagent: boolean; + kind: RowKind; + /** Number of streams under this subagent (>=1). Only set for subagent rows. */ + streamCount?: number; + /** For stream rows: the parent subagent's row key (drives expand/collapse). 
*/ + parentRowKey?: string; } /** * Conversation ids for subagent calls look like - * ::sa:subagent__ - * Split into the parent cid and a sub-agent label (or the whole thing if - * this is a top-level conversation). + * ::sa:[:s] + * The optional `:s` suffix is set when the harness fans a single + * subagent into multiple parallel "streams" (interval-graph + * decomposition in weka_trace._pack_into_streams). We split it off so + * we can group all streams of one subagent under a single header row. */ -function splitCid(cid: string): { parent: string; subagent: string | null } { +function splitCid(cid: string): { + parent: string; + subagentBase: string | null; + stream: number | null; +} { const sep = cid.indexOf('::sa:'); - if (sep === -1) return { parent: cid, subagent: null }; - return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) }; + if (sep === -1) return { parent: cid, subagentBase: null, stream: null }; + const parent = cid.slice(0, sep); + const raw = cid.slice(sep + 5); + const m = /^(.*):s(\d+)$/.exec(raw); + if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) }; + return { parent, subagentBase: raw, stream: null }; } -/** Group requests into rows; in conversation mode subagents nest under parents. */ -function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { - const groups = new Map(); - for (const r of requests) { - const key = mode === 'conversation' ? r.cid : r.wid; - let list = groups.get(key); - if (!list) { - list = []; - groups.set(key, list); - } - list.push(r); - } - +/** + * Group requests into rows. In conversation mode, output order is: + * parent_conv + * subagent_001 (collapsed by default, container) + * :s0 (hidden unless expanded) + * :s1 + * subagent_002 + * ... + * + * `expandedSubagents` controls which subagent containers reveal their + * stream children. 
Bars on a collapsed subagent are the UNION of all its + * streams' requests — overlapping bars visually communicate the + * stream-level parallelism without expanding. + */ +function buildRows( + requests: RequestRecord[], + mode: RowMode, + expandedSubagents: ReadonlySet, +): Row[] { if (mode !== 'conversation') { // Worker mode: flat rows, sorted by first activity. + const groups = new Map(); + for (const r of requests) { + let list = groups.get(r.wid); + if (!list) { + list = []; + groups.set(r.wid, list); + } + list.push(r); + } const rows: Row[] = []; let i = 0; for (const [key, list] of groups) { @@ -101,7 +141,7 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { color: ROW_COLORS[i % ROW_COLORS.length]!, requests: list, depth: 0, - isSubagent: false, + kind: 'worker', }); i++; } @@ -109,36 +149,40 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { return rows; } - // Conversation mode: build a parent → [subagents] tree so each parent - // group renders as one parent row followed by its sub-agent rows. Color - // is shared inside a tree so the visual grouping reads. + // Conversation mode — tree: parent → subagent → stream. 
interface Tree { parentCid: string; - parentRow: { key: string; requests: RequestRecord[] } | null; - subagents: Map; // subagent label → requests + parentReqs: RequestRecord[]; + // subagentBase → (streamIndex|null → requests) + subagents: Map>; firstStart: number; } const trees = new Map(); - for (const [cid, list] of groups) { - list.sort((a, b) => a.start - b.start); - const { parent, subagent } = splitCid(cid); + for (const r of requests) { + const { parent, subagentBase, stream } = splitCid(r.cid); let tree = trees.get(parent); if (!tree) { tree = { parentCid: parent, - parentRow: null, + parentReqs: [], subagents: new Map(), firstStart: Number.POSITIVE_INFINITY, }; trees.set(parent, tree); } - if (subagent === null) { - tree.parentRow = { key: cid, requests: list }; + if (subagentBase === null) { + tree.parentReqs.push(r); } else { - tree.subagents.set(subagent, list); + let saMap = tree.subagents.get(subagentBase); + if (!saMap) { + saMap = new Map(); + tree.subagents.set(subagentBase, saMap); + } + const list = saMap.get(stream); + if (list) list.push(r); + else saMap.set(stream, [r]); } - const earliest = list[0]!.start; - if (earliest < tree.firstStart) tree.firstStart = earliest; + if (r.start < tree.firstStart) tree.firstStart = r.start; } const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart); @@ -147,39 +191,66 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { for (const tree of sortedTrees) { const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!; colorIdx++; - if (tree.parentRow) { + // Parent row (use a placeholder key if the parent itself wasn't replayed). + tree.parentReqs.sort((a, b) => a.start - b.start); + rows.push({ + key: tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`, + label: tree.parentCid, + color, + requests: tree.parentReqs, + depth: 0, + kind: 'parent', + }); + + // One subagent row per base (which may contain N streams). 
+ const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => { + const aStart = Math.min( + ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ); + const bStart = Math.min( + ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ); + return aStart - bStart; + }); + for (const [saBase, streams] of subagentEntries) { + const subagentKey = `${tree.parentCid}::sa:${saBase}`; + // Union of all stream requests for collapsed-view bars. + const allReqs: RequestRecord[] = []; + for (const reqs of streams.values()) allReqs.push(...reqs); + allReqs.sort((a, b) => a.start - b.start); + const streamCount = streams.size; rows.push({ - key: tree.parentRow.key, - label: shortenCid(tree.parentCid), + key: subagentKey, + label: `↳ ${formatSubagentLabel(saBase)}`, color, - requests: tree.parentRow.requests, - depth: 0, - isSubagent: false, - }); - } else { - // Pseudo-parent header so orphan subagents still render under - // something they belong to. - rows.push({ - key: `__parent_${tree.parentCid}`, - label: shortenCid(tree.parentCid), - color, - requests: [], - depth: 0, - isSubagent: false, - }); - } - const subagentEntries = [...tree.subagents.entries()].toSorted( - (a, b) => a[1][0]!.start - b[1][0]!.start, - ); - for (const [saLabel, list] of subagentEntries) { - rows.push({ - key: `${tree.parentCid}::${saLabel}`, - label: `↳ ${formatSubagentLabel(saLabel)}`, - color, - requests: list, + requests: allReqs, depth: 1, - isSubagent: true, + kind: 'subagent', + streamCount, }); + + // Stream children only when expanded AND there's more than one + // stream (a single-stream subagent has nothing extra to show). + if (streamCount > 1 && expandedSubagents.has(subagentKey)) { + const streamEntries = [...streams.entries()].toSorted((a, b) => { + // Sort by stream index (null first as the "default" stream) + const ai = a[0] ?? -1; + const bi = b[0] ?? 
-1; + return ai - bi; + }); + for (const [streamIdx, reqs] of streamEntries) { + reqs.sort((a, b) => a.start - b.start); + rows.push({ + key: `${subagentKey}:s${streamIdx ?? '∅'}`, + label: `stream ${streamIdx ?? '∅'}`, + color, + requests: reqs, + depth: 2, + kind: 'stream', + parentRowKey: subagentKey, + }); + } + } } } return rows; @@ -192,11 +263,6 @@ function formatSubagentLabel(raw: string): string { return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`; } -function shortenCid(cid: string): string { - if (cid.length <= 12) return cid; - return `${cid.slice(0, 8)}…${cid.slice(-4)}`; -} - function shortenWid(wid: string): string { // worker_4ae87bea → w_4ae8 return wid.replace(/^worker_/, 'w_').slice(0, 12); @@ -314,6 +380,17 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { const [rowMode, setRowMode] = useState('conversation'); const [phaseFilter, setPhaseFilter] = useState('profiling'); const [tooltip, setTooltip] = useState(null); + // Which multi-stream subagents currently have their per-stream rows + // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). + const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set()); + const toggleSubagent = useCallback((key: string) => { + setExpandedSubagents((prev) => { + const next = new Set(prev); + if (next.has(key)) next.delete(key); + else next.add(key); + return next; + }); + }, []); const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); // Apply phase filter, then group into rows. @@ -322,7 +399,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { phaseFilter === 'all' ? 
data.requests : data.requests.filter((r) => r.phase === 'profiling'), [data.requests, phaseFilter], ); - const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]); + const rows = useMemo( + () => buildRows(filtered, rowMode, expandedSubagents), + [filtered, rowMode, expandedSubagents], + ); // Pre-sort the timestamp columns so the cursor-time stats popover can // count "running / waiting at time t" in O(log n). With a few hundred @@ -359,7 +439,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { const isZoomed = viewEnd !== null; // Layout - const LABEL_WIDTH = 160; + // Wide enough for a full 36-char conversation id at 10px font, plus the + // indent + color stripe + count badge. Subagent rows inherit the same + // width but truncate the longer "↳ subagent N · hash" tail with ellipsis. + const LABEL_WIDTH = 360; const ROW_HEIGHT = 22; const ROW_GAP = 3; const HEADER_HEIGHT = 24; @@ -537,33 +620,58 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {rowMode === 'conversation' ? 'Conversation' : 'Worker'} - {rows.map((row) => ( -
- { + const isSubagentRow = row.kind === 'subagent'; + const isStreamRow = row.kind === 'stream'; + const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; + const isExpanded = isExpandable && expandedSubagents.has(row.key); + return ( +
- - {row.label} - - - {row.requests.length > 0 ? row.requests.length : '—'} - -
- ))} + {isExpandable ? ( + + ) : ( + + )} + + + {row.label} + {isExpandable && ( + ×{row.streamCount} + )} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
+ ); + })} {/* Scrollable SVG */} @@ -636,6 +744,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {rows.map((row, rowIdx) => { const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; const barH = ROW_HEIGHT - 4; + // For multi-stream subagent containers, suppress the union + // bars when expanded — the child stream rows draw them + // individually instead, so we'd double-draw otherwise. + if ( + row.kind === 'subagent' && + (row.streamCount ?? 1) > 1 && + expandedSubagents.has(row.key) + ) { + return null; + } return row.requests.map((req) => { const xCredit = xOf(req.credit); const xStart = xOf(req.start); @@ -663,7 +781,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { opacity={0.35} /> )} - {/* Main bar */} + {/* Main bar — opacity stepped down with depth so + parent > subagent > stream reads visually. */} {/* Phase strip at bottom */} Date: Wed, 27 May 2026 15:07:27 -0500 Subject: [PATCH 47/55] fix(agentic-detail): make unique-input-tokens chart monotonic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vllm's per-scrape prompt_tokens.rate and prefix_cache_hits.rate counters can lag each other by several seconds across scrapes (we see prefill=0 at one tick with hits=1.1M, then prefill=1.5M with hits=452K six ticks later — lifetime totals agree but per-tick they don't). Computing cumsum(prefill - hits) per tick made the chart dip well negative at the start. Replaces the per-tick subtraction with `cumulativeDifferenceMonotonic`: union the two series by timestamp, accumulate each independently, take the diff, then enforce a running max so the curve never decreases. End-of-run totals are unchanged (both counters converge to the right value); transient skew just looks like a brief plateau instead of a negative dip. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 21 ++++++----- .../agentic-point/time-series-chart.tsx | 37 +++++++++++++++++++ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 4bebd37c..1abf64e6 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -26,7 +26,7 @@ import { StackedAreaChart, TimeSeriesChart, cumulativeAverage, - cumulativeSum, + cumulativeDifferenceMonotonic, rollingAverage, sumSeries, } from './time-series-chart'; @@ -388,20 +388,21 @@ export function AgenticPointDetail({ id }: Props) { render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; - // Unique = total prompt tokens vllm received minus the tokens - // it served from the prefix cache. The cache-miss portion is - // what actually constitutes "new content" the GPU had to - // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens. - const unique = sumSeries( - metrics.prefillTps, - metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })), - ); + // Unique = total prompt tokens received minus tokens served + // from the prefix cache. Equivalent to cumsum of + // vllm:request_prefill_kv_computed_tokens. We compute it as + // monotonic-non-decreasing cumulative-diff so per-scrape + // timing skew between the prompt_tokens and prefix_cache_hits + // counters can't make the line dip negative. 
return ( [p.t, p.value])); + const bByT = new Map(b.map((p) => [p.t, p.value])); + const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y); + const out: TimeSeriesPoint[] = Array.from({ length: allT.length }); + let cumA = 0; + let cumB = 0; + let runningMax = 0; + for (let i = 0; i < allT.length; i++) { + const t = allT[i]!; + cumA += aByT.get(t) ?? 0; + cumB += bByT.get(t) ?? 0; + const diff = cumA - cumB; + if (diff > runningMax) runningMax = diff; + out[i] = { t, value: runningMax }; + } + return out; +} + /** Pointwise sum of two arrays sharing the same t index. */ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { const n = Math.min(a.length, b.length); From 08bbe6650c73935d7ac7a9fa29a722b141911bc9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 15:15:05 -0500 Subject: [PATCH 48/55] feat(agentic-detail): add unique input tokens in flight chart New chart on the per-point view that plots the deduped count of input tokens currently held by in-flight requests, as a 30s time- weighted rolling average with the raw step series rendered as faint scatter behind it. Useful for seeing the working set the model has to hold KV cache for at any instant. Computation (frontend, from request_timeline): - At each request start/end event, maintain active ISL per cid (within one cid turns are sequential, so each cid contributes at most one in-flight ISL at a time) - total_in_flight(t) = sum over cids with active request of that cid's current ISL - Across cids we treat content as independent (cross-conv prefix sharing measured at <1 pp, so summing is a tight approximation) Adds timeRollingAverage helper: time-weighted (vs sample-count) moving average suitable for irregularly-sampled event series like this one. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 43 ++++++++- .../agentic-point/time-series-chart.tsx | 96 +++++++++++++++++++ 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 1abf64e6..2db2809b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -27,8 +27,10 @@ import { TimeSeriesChart, cumulativeAverage, cumulativeDifferenceMonotonic, + inflightUniqueTokens, rollingAverage, sumSeries, + timeRollingAverage, } from './time-series-chart'; interface Props { @@ -124,8 +126,10 @@ export function AgenticPointDetail({ id }: Props) { // shows how the metric varies across the SKU. const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? []; const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates'); - // Per-request timeline fetched only when the timeline view is active. - const timelineQuery = useRequestTimeline(id, view === 'timeline'); + // Per-request timeline used by both the timeline view AND the per-point + // "Unique input tokens in flight" chart, so fetch whenever we're on + // either view. + const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point'); return (
@@ -414,6 +418,41 @@ export function AgenticPointDetail({ id }: Props) { ); }} /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!timelineQuery.data) { + return timelineQuery.isLoading ? : ; + } + // Step function: at each request start/end, sum the ISLs of + // currently-active requests across distinct cids. Within one + // cid turns are sequential so each cid contributes at most + // one in-flight ISL; across cids we treat content as + // independent (cross-conv prefix sharing adds <1pp in + // practice). Smooth with a 30s time-weighted rolling average + // so brief turn-handoff dips don't dominate the chart. + const raw = inflightUniqueTokens(timelineQuery.data.requests); + const smoothed = timeRollingAverage(raw, 30); + return ( + + ); + }} + />
)} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 25d5a672..520b3ed6 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -27,6 +27,39 @@ interface TimeSeriesChartProps { height?: number; } +/** + * Time-weighted rolling average over a `windowS`-second trailing window. + * Treats the input as a step function (value held constant between + * samples) and integrates over the trailing window, dividing by the + * window length. Good for smoothing irregularly-sampled event series + * (e.g. request start/end events) where the regular sample-count + * `rollingAverage` would over-weight bursts of close-together events. + */ +export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] { + if (data.length === 0 || windowS <= 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + for (let i = 0; i < data.length; i++) { + const tEnd = data[i]!.t; + const tStart = Math.max(0, tEnd - windowS); + // Find the first sample j whose t is >= tStart; the step value at + // tStart is data[j-1].value if j > 0, else data[0].value. + let j = 0; + while (j < data.length && data[j]!.t < tStart) j++; + let prevT = tStart; + let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value; + let area = 0; + for (; j <= i; j++) { + const curT = data[j]!.t; + area += prevV * (curT - prevT); + prevT = curT; + prevV = data[j]!.value; + } + const dur = tEnd - tStart; + out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value }; + } + return out; +} + /** Centered rolling average over `windowSize` samples. 
*/ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] { if (data.length === 0 || windowSize <= 1) return data; @@ -75,6 +108,69 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** + * Per-event step series: at each request start/end, sum the ISLs of + * currently-active requests across distinct `cid`s. Within a single + * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N), + * so each cid contributes at most one in-flight ISL at a time. Across + * different cids we assume content is independent (parent ↔ subagent + * and conv ↔ conv share negligible prefix in practice — cross-conv + * dedup added ~0.25 pp to theoretical hit rate, so treating them as + * independent is a tight approximation of the true in-flight unique + * token count). + * + * Output is a step function: one point per event, value held constant + * until the next event. Time axis is seconds relative to the earliest + * event in `requests`. + */ +export function inflightUniqueTokens( + requests: readonly { cid: string; start: number; end: number; isl: number | null }[], +): TimeSeriesPoint[] { + if (requests.length === 0) return []; + // The request_timeline timestamps are ns-relative to its own origin. + // Convert events to seconds and emit a step series. + interface Event { + tNs: number; + kind: 'start' | 'end'; + cid: string; + isl: number; + } + const events: Event[] = []; + for (const r of requests) { + const isl = r.isl ?? 0; + if (isl <= 0) continue; + events.push({ tNs: r.start, kind: 'start', cid: r.cid, isl }); + events.push({ tNs: r.end, kind: 'end', cid: r.cid, isl }); + } + if (events.length === 0) return []; + // Sort by time; on ties, process 'end' before 'start' so a same-instant + // turn handoff within one cid doesn't transiently double-count. + events.sort((a, b) => a.tNs - b.tNs || (a.kind === 'end' ? 
-1 : 1)); + + // Active ISL per cid (max in case the same cid somehow has overlapping + // events; in practice it's always 0 or 1 request at a time per cid). + const activeByCid = new Map(); + let total = 0; + const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }]; + for (const e of events) { + const tSec = e.tNs / 1e9; + if (e.kind === 'start') { + const prev = activeByCid.get(e.cid) ?? 0; + const next = Math.max(prev, e.isl); + activeByCid.set(e.cid, next); + total += next - prev; + } else { + const cur = activeByCid.get(e.cid) ?? 0; + if (cur > 0) { + total -= cur; + activeByCid.delete(e.cid); + } + } + out.push({ t: tSec, value: Math.max(0, total) }); + } + return out; +} + /** * Monotonic-non-decreasing cumulative difference of two rate series: * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce From 7561deb1cc5a210ce6cd074ab0d4771b3b9f8342 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 20:30:39 -0500 Subject: [PATCH 49/55] feat(chart-series): extract SGLang metrics alongside vllm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our chart_series + aggregate_stats extractors hardcoded vllm:* metric names, so SGLang runs (e.g. qwen3.5/h100/sglang) ingested cleanly but the per-point detail page rendered empty charts — chart_series fields were all zero-length arrays. Adds fallback chains in each extractor: KV cache util vllm:kv_cache_usage_perc → sglang:token_usage Prefix cache hits vllm:prefix_cache_hits → sglang:cached_tokens Prefix cache qrys vllm:prefix_cache_queries → sglang:prompt_tokens Requests running vllm:num_requests_running → sglang:num_running_reqs Requests waiting vllm:num_requests_waiting → sglang:num_queue_reqs Prompt tokens rate vllm:prompt_tokens → sglang:prompt_tokens Generation rate vllm:generation_tokens → sglang:generation_tokens The `pickFirstNonEmpty` helper walks the chain and uses whichever series has data, so a future framework (mori-sglang, dynamo, etc.) 
can plug in by adding its names to each chain — no per-framework branching. CHART_SERIES_VERSION → 4, STATS_VERSION → 3. Both backfills re-ran (86 chart_series rows, 190 aggregate_stats rows). SGLang chart_series for qwen3.5 run 944 verified — was 0-length arrays before, now ~1800 samples each. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 67 +++++++++++++++---- packages/db/src/queries/agentic-aggregates.ts | 56 +++++++++++++--- 2 files changed, 98 insertions(+), 25 deletions(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 91e89521..86b79925 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -28,8 +28,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). + * + * v4: extract sglang:* metrics too (fallback chain in each picker), so + * SGLang runs populate the chart_series the same way vllm runs do. */ -export const CHART_SERIES_VERSION = 3; +export const CHART_SERIES_VERSION = 4; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -89,8 +92,13 @@ interface RawMetric { type MetricsMap = Record; -/** The set of metric subtrees the chart consumes. */ +/** + * The set of metric subtrees the chart consumes. Includes both vllm:* and + * sglang:* names so the stream-parse fallback collects whichever framework + * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric. 
+ */ const CHART_METRIC_KEYS = new Set([ + // vLLM 'vllm:kv_cache_usage_perc', 'vllm:gpu_cache_usage_perc', 'vllm:prefix_cache_hits', @@ -100,6 +108,13 @@ const CHART_METRIC_KEYS = new Set([ 'vllm:prompt_tokens', 'vllm:generation_tokens', 'vllm:prompt_tokens_by_source', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', + 'sglang:generation_tokens', + 'sglang:num_running_reqs', + 'sglang:num_queue_reqs', ]); /** @@ -220,18 +235,37 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { if (!Number.isFinite(startNs)) startNs = 0; const tOf = (ns: number) => (ns - startNs) / 1e9; + // Pick the first metric name whose series array has any data; fallback + // chain lets the same code path serve both vllm:* and sglang:* blobs. + const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; + }; + // KV cache usage (gauge, 0..1) — average across engines so the value // stays a fraction (each engine has its own KV pool). - const kvSeries = - metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvSeries = pickSeries( + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); const kvCacheUsage: TimeSeriesPoint[] = sortedEntries( aggregateByStart(kvSeries, 'avg', 'avg'), ).map(([t, v]) => ({ t: tOf(t), value: v })); // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across - // engines, joined on start_ns. - const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum'); - const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum'); + // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens. 
+ const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + const qsSeries = pickSeries( + 'vllm:prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); + const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum'); + const qsByT = aggregateByStart(qsSeries, 'rate', 'sum'); const prefixCacheHitRate: TimeSeriesPoint[] = []; for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) { const q = qsByT.get(t); @@ -239,8 +273,10 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } // Queue depth: sum running + waiting across engines per timeslice. - const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum'); - const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum'); + const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs'); + const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs'); + const runByT = aggregateByStart(runSeries, 'avg', 'sum'); + const waitByT = aggregateByStart(waitSeries, 'avg', 'sum'); const queueDepth: QueueDepthPoint[] = []; // Union of timestamps so we surface activity even if one of the gauges // didn't report a sample on a given tick. @@ -252,16 +288,19 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } // Throughput: sum the counter `rate` (already per-second) across engines. - const counterRate = (name: string): TimeSeriesPoint[] => - sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({ + // Takes a fallback chain so vllm:* and sglang:* both work. 
+ const counterRate = (...names: string[]): TimeSeriesPoint[] => { + const s = pickSeries(...names); + return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({ t: tOf(t), value: v, })); - const prefillTps = counterRate('vllm:prompt_tokens'); - const decodeTps = counterRate('vllm:generation_tokens'); + }; + const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens'); + const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens'); // Tokens served from prefix cache per scrape. Lets the frontend derive // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). - const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits'); + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); // Per-source prompt tokens — sum across engines per source label. const promptBySrcByT = new Map>(); diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts index 1ad7fd7f..da5d18a0 100644 --- a/packages/db/src/queries/agentic-aggregates.ts +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -32,8 +32,12 @@ import type { DbClient } from '../connection.js'; * * v2: aggregate vllm gauges/counters across all engine series (was reading * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). + * + * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate + * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way + * they do for vllm runs. */ -export const STATS_VERSION = 2; +export const STATS_VERSION = 3; export interface MetricPercentiles { mean: number; @@ -199,6 +203,18 @@ function aggregateSeriesByStart( * Aggregates across all engine series so multi-engine DP/PP deployments are * counted correctly (previously we only read engine 0). */ +/** First metric whose series array is non-empty; supports vllm/sglang fallback. 
*/ +function pickFirstNonEmpty( + metrics: Record, + ...names: string[] +): Series[] | undefined { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; +} + export function extractServerMetricSamples(json: string): { kvCacheUtil: number[]; prefixCacheHitRate: number[]; @@ -208,17 +224,29 @@ export function extractServerMetricSamples(json: string): { // KV cache util — per-engine gauge in [0, 1]. Average across engines so the // value stays a percentage; summing would give meaningless 0..N. - const kvSeriesAll = - metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvSeriesAll = pickFirstNonEmpty( + metrics, + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()]; // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across - // all engines. Sum first, then divide. - const hitsAll = - metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series; - const queriesAll = - metrics['vllm:prefix_cache_queries']?.series ?? - metrics['vllm:gpu_prefix_cache_queries']?.series; + // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens. + const hitsAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_hits', + 'vllm:gpu_prefix_cache_hits', + 'sglang:cached_tokens', + ); + const queriesAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum'); const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum'); const prefixCacheHitRate: number[] = []; @@ -232,12 +260,18 @@ export function extractServerMetricSamples(json: string): { /** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. 
*/ const TARGET_METRIC_KEYS = new Set([ + // vLLM 'vllm:kv_cache_usage_perc', - 'vllm:gpu_cache_usage_perc', // older fallback name + 'vllm:gpu_cache_usage_perc', 'vllm:prefix_cache_hits', 'vllm:prefix_cache_queries', - 'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths) + 'vllm:gpu_prefix_cache_hits', 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', ]); /** From 625d6e85e411cf8081977d3b76ad98d1805ad3c5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 20:48:58 -0500 Subject: [PATCH 50/55] fix(ingest): derive GPU cache hit rate for SGLang at ingest time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SGLang runs' harness JSON doesn't populate server_gpu_cache_hit_rate (vLLM runs do), so the detail-page header and inference chart tooltip showed "—" for SGLang points. Now at trace_replay ingest, if any of the linked benchmark_results rows has a null server_gpu_cache_hit_rate and we have non-empty prefill/hits time-series in the computed chart_series, derive the lifetime cluster ratio as sum(hits.rate) / sum(prompt.rate) and write it into the row's metrics JSONB. Already-stored SGLang rows from runs 944/945 backfilled via a one-off UPDATE earlier in this session (8 rows, mostly ~87-89% hit rate, one high-conc outlier at 2.4%). 
Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 8cc03f2a..8d1e01b8 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -100,4 +100,23 @@ export async function insertTraceReplay( set trace_replay_id = ${traceReplayId} where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; + + // Derive a lifetime GPU cache hit rate from chart_series for any linked + // row whose harness JSON didn't set one (SGLang runs don't populate + // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has + // no usable prefill data — leaves the field null in that case, matching + // legacy "no trace_replay" behavior. + if (chartSeries && chartSeries.prefillTps.length > 0) { + const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); + const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + if (sumPrompts > 0) { + const rate = sumHits / sumPrompts; + await sql` + update benchmark_results + set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric)) + where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) + and (metrics->>'server_gpu_cache_hit_rate') is null + `; + } + } } From aa76e9eca423d3ab2c7079ff28d74b70adefae1c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 14:38:52 -0500 Subject: [PATCH 51/55] feat(chart-series): map sglang:realtime_tokens to promptTokensBySource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Cumulative prompt token source breakdown" chart was empty for SGLang runs because the vllm-specific vllm:prompt_tokens_by_source metric doesn't exist on SGLang. 
Maps sglang:realtime_tokens (which has mode={prefill_cache, prefill_compute, decode}) into the same source breakdown when no vllm series is present, filtered to prefill_* modes (decode tokens are output throughput, not prompt-token volume). CHART_SERIES_VERSION → 5. Backfilled 128 rows; SGLang rows from runs 944/946/947 now have prefill_cache + prefill_compute sources populated. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 31 ++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 86b79925..0807e238 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -31,8 +31,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * * v4: extract sglang:* metrics too (fallback chain in each picker), so * SGLang runs populate the chart_series the same way vllm runs do. + * + * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) + * into promptTokensBySource so the cumulative prompt-token-source-breakdown + * chart shows useful splits for SGLang runs (filtered to prefill_* modes). */ -export const CHART_SERIES_VERSION = 4; +export const CHART_SERIES_VERSION = 5; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -115,6 +119,7 @@ const CHART_METRIC_KEYS = new Set([ 'sglang:generation_tokens', 'sglang:num_running_reqs', 'sglang:num_queue_reqs', + 'sglang:realtime_tokens', ]); /** @@ -303,6 +308,12 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); // Per-source prompt tokens — sum across engines per source label. + // vllm: vllm:prompt_tokens_by_source has one series per source label + // (local_cache_hit, external_cache_hit, miss, ...). 
Use the + // `source`/`reason`/`kind` label as the breakdown key. + // sglang: sglang:realtime_tokens uses a `mode` label with values + // {prefill_cache, prefill_compute, decode}. Filter to prefill_* + // since decode isn't prompt-token volume. const promptBySrcByT = new Map>(); for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) { const labels = series.labels ?? {}; @@ -318,6 +329,24 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } } + // SGLang fallback: only consider when the vllm metric wasn't found. + if (promptBySrcByT.size === 0) { + for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const mode = labels['mode'] ?? 'unknown'; + if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens) + let byT = promptBySrcByT.get(mode); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(mode, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + } const promptTokensBySource: Record = {}; for (const [source, byT] of promptBySrcByT) { const arr: TimeSeriesPoint[] = []; From 5872a3d8d3c6f5e6feee879e2f8f6f5d0ddd04ac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 14:48:27 -0500 Subject: [PATCH 52/55] feat(chart-series): break out SGLang cache hits by cache_source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously SGLang detail pages showed two stacked-area layers in the prompt-token source breakdown: prefill_cache (everything that hit the cache) + prefill_compute (cache miss). The user wanted finer granularity — specifically a distinction between on-GPU HBM cache and CPU-offloaded (hicache) host cache. 
SGLang's sglang:cached_tokens metric carries a cache_source label that varies per cache tier: - "device" → on-GPU HBM cache hit - "host" → CPU-offload (hicache) cache hit - "total" → older sglang, single series with no tier breakdown Switches the cache-hit portion of the breakdown from the coarse `prefill_cache` mode label to per-cache_source series: - device → "cache hit (HBM)" - host → "cache hit (CPU offload)" - total → "cache hit" - other → "cache hit (<cache_source>)" Cache misses still come from realtime_tokens[mode=prefill_compute], relabeled "compute (miss)" for symmetry. Current data only contains device/total (no hicache runs ingested yet) — when hicache runs come in, the chart will automatically split cache hits into HBM + CPU-offload layers with no further code change. CHART_SERIES_VERSION → 6. Backfilled 128 rows. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 47 +++++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 0807e238..1996708f 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -35,8 +35,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) * into promptTokensBySource so the cumulative prompt-token-source-breakdown * chart shows useful splits for SGLang runs (filtered to prefill_* modes). + * + * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source + * breakdown from sglang:cached_tokens — current runs always have one + * cache_source ("device" / HBM) but hicache (CPU offload) runs would + * split into "device" + "host" automatically once ingested. */ -export const CHART_SERIES_VERSION = 5; +export const CHART_SERIES_VERSION = 6; export interface TimeSeriesPoint { /** Seconds from benchmark start.
*/ @@ -330,15 +335,49 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } // SGLang fallback: only consider when the vllm metric wasn't found. + // - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]` + // - Cache hits, split by tier: per-series `sglang:cached_tokens` where each + // series carries a `cache_source` label ("device" = HBM, "host" = CPU + // offload via hicache). Current runs have only `device`; when hicache + // runs land, additional series will appear and the chart will split. if (promptBySrcByT.size === 0) { for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { const labels = series.labels ?? {}; const mode = labels['mode'] ?? 'unknown'; - if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens) - let byT = promptBySrcByT.get(mode); + // Only carry the cache-miss line over — cache hits come from + // sglang:cached_tokens broken out by cache_source below, so we'd + // double-count if we kept `prefill_cache` here too. + if (mode !== 'prefill_compute') continue; + const label = 'compute (miss)'; + let byT = promptBySrcByT.get(label); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(label, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + // Cache hits broken out per cache_source. Strip the noisy "total" label + // (older sglang versions emit a single un-broken-out series labelled + // total — show that as just "cache hit"). + for (const series of metrics['sglang:cached_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const src = labels['cache_source'] ?? 'cache hit'; + const label = + src === 'device' + ? 'cache hit (HBM)' + : src === 'host' + ? 'cache hit (CPU offload)' + : src === 'total' + ? 
'cache hit' + : `cache hit (${src})`; + let byT = promptBySrcByT.get(label); if (!byT) { byT = new Map(); - promptBySrcByT.set(mode, byT); + promptBySrcByT.set(label, byT); } for (const ts of series.timeslices ?? []) { if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { From 94a3e8b1986e54165c062e2a14eda60d9e9dd146 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:01:24 -0500 Subject: [PATCH 53/55] feat(chart-series): host cache util line + fix SGLang stacked-area colors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for SGLang hicache rendering on the agentic detail page: 1. KV cache utilization chart was GPU-HBM-only. SGLang hicache runs also expose sglang:hicache_host_{used,total}_tokens — the CPU offload pool's tokens-in-use over its capacity. Extracted as a new `hostKvCacheUsage` time series; frontend overlays it as a second orange line on the existing chart when the row has hicache data. 2. The cumulative-prompt-token-source-breakdown chart rendered ALL three SGLang sources in the same color, because the colors dict only knew vllm-style names (local_compute, local_cache_hit, etc.). Added explicit colors for the SGLang label names ('cache hit (HBM)', 'cache hit (CPU offload)', 'cache hit', 'compute (miss)') plus a memoized fallback palette so any future unknown source name gets a distinct color rather than falling through to gray. CHART_SERIES_VERSION → 7. Backfilled 128 rows; hicache rows from workflow_run 947 (8 rows) now have ~1830 hostKvCacheUsage samples matching their HBM samples. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 16 ++++++++- .../agentic-point/time-series-chart.tsx | 30 ++++++++++++++-- .../src/hooks/api/use-trace-server-metrics.ts | 2 ++ packages/db/src/etl/compute-chart-series.ts | 36 ++++++++++++++++++- .../db/src/queries/trace-server-metrics.ts | 3 ++ 5 files changed, 83 insertions(+), 4 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 2db2809b..b047ea8f 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -236,16 +236,30 @@ export function AgenticPointDetail({ id }: Props) { render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; + // For SGLang hicache rows we have both GPU (HBM) util and + // host (CPU offload pool) util — overlay them as two lines. + const hasHost = metrics.hostKvCacheUsage.length > 0; return ( = { + // vLLM source names local_compute: '#f97316', local_cache_hit: '#3b82f6', external_kv_transfer: '#22c55e', miss: '#f97316', + // SGLang source names (set by compute-chart-series for sglang rows) + 'cache hit (HBM)': '#3b82f6', + 'cache hit (CPU offload)': '#22c55e', + 'cache hit': '#3b82f6', + 'compute (miss)': '#f97316', }; const labelFor: Record = { local_compute: 'Prefill', @@ -496,6 +502,26 @@ export function StackedAreaChart({ external_kv_transfer: 'Offload Cache Hit', miss: 'Miss', }; + // Fallback palette for any source name not in `colors` so we never + // emit two layers in the same shade. Cycles by insertion order. 
+ const fallbackPalette = [ + '#3b82f6', + '#f97316', + '#22c55e', + '#a855f7', + '#ef4444', + '#06b6d4', + '#f59e0b', + '#ec4899', + ]; + let fallbackIdx = 0; + const colorFor = (name: string): string => { + if (colors[name]) return colors[name]!; + const c = fallbackPalette[fallbackIdx % fallbackPalette.length]!; + fallbackIdx++; + colors[name] = c; // memoize so the SAME unknown name always gets the same color + return c; + }; if (!computed) { return ( @@ -522,7 +548,7 @@ export function StackedAreaChart({ .toReversed() .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`) .join(' ')} Z`; - const color = colors[name] ?? '#6b7280'; + const color = colorFor(name); for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; return { name, color, d }; }); @@ -540,7 +566,7 @@ export function StackedAreaChart({ } } const items: HoverItem[] = stackOrder.map((name) => ({ - color: colors[name] ?? '#6b7280', + color: colorFor(name), label: labelFor[name] ?? name, value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`, })); diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts index 664bc6c7..bac67a50 100644 --- a/packages/app/src/hooks/api/use-trace-server-metrics.ts +++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts @@ -44,6 +44,8 @@ export interface TraceServerMetrics { decodeTps: TimeSeriesPoint[]; /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. 
*/ + hostKvCacheUsage: TimeSeriesPoint[]; } async function fetchTraceServerMetrics( diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 1996708f..8105961e 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -40,8 +40,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * breakdown from sglang:cached_tokens — current runs always have one * cache_source ("device" / HBM) but hicache (CPU offload) runs would * split into "device" + "host" automatically once ingested. + * + * v7: extract sglang:hicache_host_{used,total}_tokens into a new + * hostKvCacheUsage series so the KV cache utilization chart can plot + * the CPU offload pool's usage alongside the on-GPU HBM line. */ -export const CHART_SERIES_VERSION = 6; +export const CHART_SERIES_VERSION = 7; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -79,6 +83,12 @@ export interface ChartSeries { * saved vs the raw queries that came in. */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** + * Host (CPU offload) KV cache utilization, 0..1. Only populated for + * SGLang hicache runs (derived as hicache_host_used / hicache_host_total). + * Frontend overlays this on the KV cache util chart as a second line. + */ + hostKvCacheUsage: TimeSeriesPoint[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -125,6 +135,8 @@ const CHART_METRIC_KEYS = new Set([ 'sglang:num_running_reqs', 'sglang:num_queue_reqs', 'sglang:realtime_tokens', + 'sglang:hicache_host_used_tokens', + 'sglang:hicache_host_total_tokens', ]); /** @@ -312,6 +324,27 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). 
const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + // SGLang hicache: host-pool KV cache utilization as used/total per + // timeslice. Both metrics are gauges in absolute tokens. Total stays + // constant (it's the pool size), used fluctuates. + const hostUsedByT = aggregateByStart( + metrics['sglang:hicache_host_used_tokens']?.series, + 'avg', + 'sum', + ); + const hostTotalByT = aggregateByStart( + metrics['sglang:hicache_host_total_tokens']?.series, + 'avg', + 'sum', + ); + const hostKvCacheUsage: TimeSeriesPoint[] = []; + for (const [t, used] of [...hostUsedByT.entries()].toSorted((a, b) => a[0] - b[0])) { + const total = hostTotalByT.get(t); + if (total !== undefined && total > 0) { + hostKvCacheUsage.push({ t: tOf(t), value: used / total }); + } + } + // Per-source prompt tokens — sum across engines per source label. // vllm: vllm:prompt_tokens_by_source has one series per source label // (local_cache_hit, external_cache_hit, miss, ...). Use the @@ -407,5 +440,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { prefillTps, decodeTps, prefixCacheHitsTps, + hostKvCacheUsage, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 76775e77..eccb0a0c 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -73,6 +73,8 @@ export interface TraceServerMetrics { decodeTps: TimeSeriesPoint[]; /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. 
*/ + hostKvCacheUsage: TimeSeriesPoint[]; } interface RawMetaRow extends PointMeta { @@ -118,6 +120,7 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { decodeTps: series.decodeTps, // v2 chart_series rows pre-backfill don't have this field — default to [] prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], + hostKvCacheUsage: series.hostKvCacheUsage ?? [], }; } From 93e197b7e54d140acfe65b61aeb4f5c48ca27091 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:19:20 -0500 Subject: [PATCH 54/55] fix(stacked-area): align sources by timestamp before computing shares MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cumulative-prompt-token-source-breakdown chart was showing huge "100% compute (miss)" plateaus around minute 20-24 of many SGLang runs. Root cause: the chart computed cumulative shares per ARRAY INDEX (not timestamp), but in SGLang's per-scrape metrics, cache hits and misses fire on different ticks — one scrape reports 193K hits + 0 miss, the next reports 0 hits + 8K miss. So each source has a different timestamp array. Indexing them in lockstep mixed values from different moments and made the share calculation flap to 100% one side or the other. Fix: union timestamps across all sources, then for each unique timestamp carry forward each source's cumulative sum (a source that didn't report at time t holds its previous cumulative value rather than appearing as 0). After fix: shares change smoothly over time as each source's cumulative sum grows; transient single-tick gaps no longer drive the visible share to either extreme. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/time-series-chart.tsx | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 15a15869..75d7bb1e 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -464,15 +464,36 @@ export function StackedAreaChart({ const computed = useMemo(() => { const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); if (entries.length === 0) return null; - const tValues = entries[0]![1].map((p) => p.t); + + // Different sources can land on different scrape timestamps + // (SGLang's hits/misses fire on alternating ticks), so we MUST + // align across all sources before computing shares — otherwise the + // share calculation indexes into each source's own time axis and + // mixes values from different moments. + // + // Approach: union all timestamps across sources, then for each + // unique timestamp carry forward the cumulative sum for every + // source (a source that didn't report at time t holds its previous + // cumulative value rather than dropping to 0). + const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted( + (a, b) => a - b, + ); + + // For each source, walk its (sorted) array and produce a parallel + // cumulative-sum array indexed against `tValues` via carry-forward. 
const cum: Record = {}; for (const [name, arr] of entries) { + const valByT = new Map(arr.map((p) => [p.t, p.value])); + const out: number[] = Array.from({ length: tValues.length }); let acc = 0; - cum[name] = arr.map((p) => { - acc += p.value; - return acc; - }); + for (let i = 0; i < tValues.length; i++) { + const v = valByT.get(tValues[i]!); + if (v !== undefined) acc += v; + out[i] = acc; + } + cum[name] = out; } + const shares: Record = {}; for (const name of Object.keys(cum)) shares[name] = []; for (let i = 0; i < tValues.length; i++) { From c14e19e277930495e4a43c3a6d6f42a611fec336 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:44:07 -0500 Subject: [PATCH 55/55] fix(ingest): split GPU vs CPU cache hit rate for SGLang hicache rows Previous inline derivation (commit 625d6e8) summed ALL cache hit sources into server_gpu_cache_hit_rate, which conflated GPU HBM hits with CPU offload hits on SGLang hicache rows. The harness JSON also never sets server_cpu_cache_hit_rate. Now derives both metrics from chart_series.promptTokensBySource: server_gpu_cache_hit_rate = sum(HBM + 'cache hit') / sum(prompts) server_cpu_cache_hit_rate = sum(CPU offload) / sum(prompts) or null (null when no CPU offload source exists) Falls back to prefixCacheHitsTps for vLLM rows where promptTokensBySource isn't broken out by cache source. Overwrites any pre-existing value so the derivation stays consistent with what the detail-page charts plot. 
Backfilled all existing rows via two-phase SQL update earlier in the session: - 8 hicache rows in workflow_run 947 now show GPU ~1-2% / CPU ~87-91% - Other SGLang rows show GPU ~87% / CPU null - vLLM rows restored to their original GPU hit rates Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 40 +++++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 8d1e01b8..43655d9a 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -101,21 +101,43 @@ export async function insertTraceReplay( where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; - // Derive a lifetime GPU cache hit rate from chart_series for any linked - // row whose harness JSON didn't set one (SGLang runs don't populate - // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has - // no usable prefill data — leaves the field null in that case, matching - // legacy "no trace_replay" behavior. + // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang + // runs don't populate these in the harness JSON; vLLM runs do but only + // for GPU. We always recompute to keep the derivation consistent with + // what the detail-page charts plot — overwriting any pre-existing value. + // + // For hicache (CPU offload) rows the chart_series.promptTokensBySource + // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)" + // sources, letting us split GPU vs CPU hit rate. Other rows just have + // a single cache-hit source (either "cache hit (HBM)" / "cache hit" + // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps + // sum which equals the single cache source's total). 
if (chartSeries && chartSeries.prefillTps.length > 0) { const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); - const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); if (sumPrompts > 0) { - const rate = sumHits / sumPrompts; + const sumOf = (name: string): number => + (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0); + const cpuHits = sumOf('cache hit (CPU offload)'); + const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit'); + // If the source breakdown has a HBM entry, use it (covers SGLang). + // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path). + const gpuHits = + hbmFromBreakdown > 0 + ? hbmFromBreakdown + : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + const gpuRate = gpuHits / sumPrompts; + const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null; await sql` update benchmark_results - set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric)) + set metrics = jsonb_set( + case when ${cpuRate}::numeric is not null + then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric)) + else metrics + end, + '{server_gpu_cache_hit_rate}', + to_jsonb(${gpuRate}::numeric) + ) where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) - and (metrics->>'server_gpu_cache_hit_rate') is null `; } }