Detect cold Fireworks deployments; tighten TTFT/queue thresholds

jahooma · claude · jahooma · commit 4a0efb8470c7 · 2026-04-18T16:28:23.000-07:00
Replicas=0 or no replicas metric at all (the deployment has been scaled
to zero or dropped from the scrape) now flips that deployment's health
to unhealthy unconditionally, so admission fails closed instead of
funneling users to a backend that cannot serve traffic. Also drop
generationQueueMs degraded 5000 -> 400 and ttftMs degraded 8000 -> 2000,
and comment out the kimi deployment since only glm-5.1 is in production.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/scripts/check-fireworks-health.ts b/scripts/check-fireworks-health.ts
@@ -110,6 +110,7 @@ async function main() {
     console.log(`── ${color}${health.status.toUpperCase().padEnd(9)}${RESET} ${model}`)
     console.log(`   deployment:            ${deployment}`)
     console.log(`   base model:            ${health.baseModel ?? 'n/a'}`)
+    console.log(`   replicas:              ${health.metrics.replicas ?? 'n/a'}`)
     console.log(`   request rate:          ${health.metrics.requestRate.toFixed(3)} req/s`)
     console.log(`   error rate:            ${health.metrics.errorRate.toFixed(3)} err/s (${formatPct(health.metrics.errorFraction)})`)
     console.log(`   concurrent requests:   ${health.metrics.concurrentRequests.toFixed(2)}`)
diff --git a/web/src/llm-api/fireworks-config.ts b/web/src/llm-api/fireworks-config.ts
@@ -10,6 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217'
 
 export const FIREWORKS_DEPLOYMENT_MAP: Record<string, string> = {
   // 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9',
-  'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',
+  // 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',
   'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea',
 }
diff --git a/web/src/server/fireworks-monitor/__tests__/compute-health.test.ts b/web/src/server/fireworks-monitor/__tests__/compute-health.test.ts
@@ -18,9 +18,18 @@ function fixture(params: {
   kvSlots?: number
   queueBuckets?: Array<{ le: string; count: number }>
   ttftBuckets?: Array<{ le: string; count: number }>
+  /** deployment_replicas gauge. Defaults to 1 so existing tests stay healthy.
+   *  Set to 0 or null to simulate a cold/deleted deployment. */
+  replicas?: number | null
 }): string {
   const lines: string[] = []
   const labels = `base_model="m",deployment="${DEPLOYMENT}",deployment_account="test-acc",deployment_id="d1"`
+  const replicas = params.replicas === undefined ? 1 : params.replicas
+  if (replicas !== null) {
+    lines.push(
+      `deployment_replicas{deployment_account="test-acc",deployment_id="d1"} ${replicas}`,
+    )
+  }
   if (params.requestRate !== undefined) {
     lines.push(`request_counter_total:sum_by_deployment{${labels}} ${params.requestRate}`)
   }
@@ -182,9 +191,38 @@ describe('computeDeploymentHealth', () => {
     expect(health.reasons.some((r) => r.includes('error rate'))).toBe(true)
   })
 
+  test('flags deployment with zero replicas as unhealthy', () => {
+    const metrics = parsePrometheusText(
+      fixture({ requestRate: 0, errorRate: 0, kvBlocks: 0, replicas: 0 }),
+    )
+    const health = computeDeploymentHealth({
+      deployment: DEPLOYMENT,
+      metrics,
+      thresholds: DEFAULT_HEALTH_THRESHOLDS,
+    })
+    expect(health.status).toBe('unhealthy')
+    expect(health.metrics.replicas).toBe(0)
+    expect(health.reasons.some((r) => r.includes('replicas'))).toBe(true)
+  })
+
+  test('flags deployment with no replicas metric as unhealthy (cold / deleted)', () => {
+    const metrics = parsePrometheusText(
+      fixture({ requestRate: 0, errorRate: 0, kvBlocks: 0, replicas: null }),
+    )
+    const health = computeDeploymentHealth({
+      deployment: DEPLOYMENT,
+      metrics,
+      thresholds: DEFAULT_HEALTH_THRESHOLDS,
+    })
+    expect(health.status).toBe('unhealthy')
+    expect(health.metrics.replicas).toBeNull()
+    expect(health.reasons.some((r) => r.includes('cold or deleted'))).toBe(true)
+  })
+
   test('sums error counters across multiple HTTP codes', () => {
     const labels = `base_model="m",deployment="${DEPLOYMENT}",deployment_id="d1"`
     const text = [
+      `deployment_replicas{deployment_account="test-acc",deployment_id="d1"} 1`,
       `request_counter_total:sum_by_deployment{${labels}} 100`,
       `requests_error_total:sum_by_deployment{${labels},http_code="500"} 3`,
       `requests_error_total:sum_by_deployment{${labels},http_code="429"} 5`,
@@ -231,9 +269,11 @@ describe('computeSnapshot', () => {
   test('overall status is the worst across deployments', () => {
     const dep2 = 'accounts/test-acc/deployments/d2'
     const text = [
+      `deployment_replicas{deployment_id="d1"} 1`,
       `request_counter_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 100`,
       `requests_error_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1",http_code="500"} 0`,
       `generator_kv_blocks_fraction:avg_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 0.1`,
+      `deployment_replicas{deployment_id="d2"} 1`,
       `request_counter_total:sum_by_deployment{deployment="${dep2}",deployment_id="d2"} 100`,
       `requests_error_total:sum_by_deployment{deployment="${dep2}",deployment_id="d2",http_code="500"} 30`,
       `generator_kv_blocks_fraction:avg_by_deployment{deployment="${dep2}",deployment_id="d2"} 0.1`,
diff --git a/web/src/server/fireworks-monitor/__tests__/monitor.test.ts b/web/src/server/fireworks-monitor/__tests__/monitor.test.ts
@@ -17,6 +17,7 @@ afterEach(() => {
 const DEPLOYMENT = 'accounts/test-acc/deployments/d1'
 
 const HEALTHY_BODY = [
+  `deployment_replicas{deployment_id="d1"} 1`,
   `request_counter_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 10`,
   `requests_error_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1",http_code="500"} 0`,
   `generator_kv_blocks_fraction:avg_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 0.1`,
diff --git a/web/src/server/fireworks-monitor/compute-health.ts b/web/src/server/fireworks-monitor/compute-health.ts
@@ -49,9 +49,9 @@ export const DEFAULT_HEALTH_THRESHOLDS: HealthThresholds = {
   errorFractionUnhealthy: 0.1,
   kvBlocksFractionDegraded: 0.95,
   kvBlocksFractionUnhealthy: 0.99,
-  generationQueueMsDegraded: 5_000,
+  generationQueueMsDegraded: 400,
   generationQueueMsUnhealthy: 15_000,
-  ttftMsDegraded: 8_000,
+  ttftMsDegraded: 2_000,
   ttftMsUnhealthy: 30_000,
 }
 
@@ -69,6 +69,15 @@ export function computeDeploymentHealth(params: {
 }): DeploymentHealth {
   const { deployment, metrics, thresholds } = params
   const filter = { deployment }
+  const deploymentId = parseDeploymentId(deployment)
+
+  // `deployment_replicas` is keyed by deployment_id (not the full deployment
+  // path). Zero or missing replicas means the deployment is cold / scaled to
+  // zero / deleted — admission must fail closed in that case.
+  const replicasSamples = findSamples(metrics, 'deployment_replicas', {
+    deployment_id: deploymentId,
+  })
+  const replicas = replicasSamples.length > 0 ? sumSamples(replicasSamples) : null
 
   const requestRateSamples = findSamples(
     metrics,
@@ -121,7 +130,6 @@ export function computeDeploymentHealth(params: {
     ...errorRateSamples,
   ].find((s) => s.labels.base_model)
   const baseModel = baseModelSample?.labels.base_model ?? null
-  const deploymentId = baseModelSample?.labels.deployment_id ?? parseDeploymentId(deployment)
 
   const reasons: string[] = []
   let status: DeploymentHealthStatus = 'healthy'
@@ -130,6 +138,18 @@ export function computeDeploymentHealth(params: {
     if (STATUS_RANK[next] > STATUS_RANK[status]) status = next
   }
 
+  // A deployment with no running replicas cannot serve traffic. Treat as
+  // unhealthy unconditionally so admission stops funneling users to a cold
+  // backend. Missing gauge (`replicas === null`) is the strongest signal
+  // Fireworks has dropped the deployment from its scrape entirely.
+  if (replicas === null) {
+    reasons.push('no replicas metric — deployment cold or deleted')
+    upgrade('unhealthy')
+  } else if (replicas <= 0) {
+    reasons.push(`replicas=${replicas}`)
+    upgrade('unhealthy')
+  }
+
   if (requestRate >= thresholds.minRequestRateForErrorCheck) {
     if (errorFraction >= thresholds.errorFractionUnhealthy) {
       reasons.push(`error rate ${(errorFraction * 100).toFixed(1)}% ≥ ${(thresholds.errorFractionUnhealthy * 100).toFixed(1)}%`)
@@ -175,6 +195,7 @@ export function computeDeploymentHealth(params: {
     status,
     reasons,
     metrics: {
+      replicas,
       requestRate,
       errorRate,
       errorFraction,
@@ -223,6 +244,7 @@ export function computeSnapshot(params: {
         status: 'unknown',
         reasons: ['no scrape yet'],
         metrics: {
+          replicas: null,
           requestRate: 0,
           errorRate: 0,
           errorFraction: 0,
diff --git a/web/src/server/fireworks-monitor/types.ts b/web/src/server/fireworks-monitor/types.ts
@@ -18,6 +18,9 @@ export interface DeploymentHealth {
   status: DeploymentHealthStatus
   reasons: string[]
   metrics: {
+    /** null when Fireworks doesn't emit a deployment_replicas gauge for the
+     *  deployment (cold / deleted / not-yet-scraped). 0 means scaled-to-zero. */
+    replicas: number | null
     requestRate: number
     errorRate: number
     errorFraction: number

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217'`
`10`	`10`
`11`	`11`	`export const FIREWORKS_DEPLOYMENT_MAP: Record<string, string> = {`
`12`	`12`	`// 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9',`
`13`		`- 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',`
	`13`	`+ // 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',`
`14`	`14`	`'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea',`
`15`	`15`	`}`