Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ debug/

# Infisical config (user-specific)
.infisical.json
.gstack/
38 changes: 27 additions & 11 deletions web/src/llm-api/__tests__/fireworks-deployment.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,35 @@ describe('Fireworks deployment routing', () => {
})

it('isDeploymentCoolingDown returns false initially', () => {
  // No deployment has been marked as scaling up yet.
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
})

it('isDeploymentCoolingDown returns true after markDeploymentScalingUp', () => {
  // Marking a deployment as scaling up should start its cooldown window.
  markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
})

it('isDeploymentCoolingDown returns false after resetDeploymentCooldown', () => {
  markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
  // Calling reset with no id clears every deployment's cooldown.
  resetDeploymentCooldown()
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
})

it('cooldown on one deployment does not affect other deployments', () => {
  // Mark only one deployment; a different deployment id must stay routable.
  const unrelatedDeploymentId = 'accounts/james-65d217/deployments/other123'
  markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(unrelatedDeploymentId)).toBe(false)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
})

it('resetDeploymentCooldown with an id only clears that deployment', () => {
  // Put two deployments into cooldown, then clear just the first one.
  const unrelatedDeploymentId = 'accounts/james-65d217/deployments/other123'
  for (const id of [DEPLOYMENT_MODEL_ID, unrelatedDeploymentId]) {
    markDeploymentScalingUp(id)
  }
  resetDeploymentCooldown(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
  expect(isDeploymentCoolingDown(unrelatedDeploymentId)).toBe(true)
})

it('DEPLOYMENT_COOLDOWN_MS is 2 minutes', () => {
Expand Down Expand Up @@ -195,8 +211,8 @@ describe('Fireworks deployment routing', () => {
expect(fetchCalls).toHaveLength(2)
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
// Verify cooldown was activated
expect(isDeploymentCoolingDown()).toBe(true)
// Verify cooldown was activated for this specific deployment
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
} finally {
spy.restore()
}
Expand Down Expand Up @@ -243,7 +259,7 @@ describe('Fireworks deployment routing', () => {
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
// Non-scaling 503 should NOT activate the cooldown
expect(isDeploymentCoolingDown()).toBe(false)
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
} finally {
spy.restore()
}
Expand Down Expand Up @@ -283,15 +299,15 @@ describe('Fireworks deployment routing', () => {
expect(fetchCalls).toHaveLength(2)
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
expect(isDeploymentCoolingDown()).toBe(false)
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
} finally {
spy.restore()
}
})

it('skips deployment during cooldown and goes straight to standard API', async () => {
const spy = spyDeploymentHours(true)
markDeploymentScalingUp()
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)

const fetchCalls: string[] = []
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
Expand Down
36 changes: 23 additions & 13 deletions web/src/llm-api/fireworks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,32 @@ export function isDeploymentHours(_now: Date = new Date()): boolean {
}

/**
* In-memory cooldown to avoid repeatedly hitting a deployment that is scaling up.
* After a DEPLOYMENT_SCALING_UP 503, we skip the deployment for this many ms.
* In-memory per-deployment cooldown to avoid repeatedly hitting a deployment
* that is scaling up. After a DEPLOYMENT_SCALING_UP 503, we skip that
* deployment for this many ms. Keyed by full deployment path so one
* deployment's cooldown doesn't affect routing to other deployments.
*/
export const DEPLOYMENT_COOLDOWN_MS = 2 * 60 * 1000
let deploymentScalingUpUntil = 0

export function isDeploymentCoolingDown(): boolean {
return Date.now() < deploymentScalingUpUntil
const deploymentCooldowns = new Map<string, number>()

export function isDeploymentCoolingDown(deploymentId: string): boolean {
const until = deploymentCooldowns.get(deploymentId)
if (until === undefined) return false
if (Date.now() < until) return true
deploymentCooldowns.delete(deploymentId)
return false
}

export function markDeploymentScalingUp(): void {
deploymentScalingUpUntil = Date.now() + DEPLOYMENT_COOLDOWN_MS
export function markDeploymentScalingUp(deploymentId: string): void {
deploymentCooldowns.set(deploymentId, Date.now() + DEPLOYMENT_COOLDOWN_MS)
}

export function resetDeploymentCooldown(): void {
deploymentScalingUpUntil = 0
export function resetDeploymentCooldown(deploymentId?: string): void {
if (deploymentId === undefined) {
deploymentCooldowns.clear()
return
}
deploymentCooldowns.delete(deploymentId)
}

export function isFireworksModel(model: string): boolean {
Expand Down Expand Up @@ -725,9 +735,9 @@ export async function createFireworksRequestWithFallback(params: {
const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel]
const shouldTryDeployment =
useCustomDeployment &&
deploymentModelId &&
!!deploymentModelId &&
isDeploymentHours() &&
!isDeploymentCoolingDown()
!isDeploymentCoolingDown(deploymentModelId)

if (shouldTryDeployment) {
logger.info(
Expand All @@ -749,7 +759,7 @@ export async function createFireworksRequestWithFallback(params: {
'Fireworks custom deployment returned 5xx, falling back to standard API',
)
if (errorText.includes('DEPLOYMENT_SCALING_UP')) {
markDeploymentScalingUp()
markDeploymentScalingUp(deploymentModelId)
}
// Fall through to standard API request below
} else {
Expand Down
34 changes: 33 additions & 1 deletion web/src/server/free-session/__tests__/fireworks-health.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,43 @@ describe('fireworks health classifier', () => {
expect(classify([], [DEPLOY])).toBe('healthy')
})

test('worst-of across multiple deployments — unhealthy wins over degraded', () => {
test('healthy if any deployment is healthy (one deployment per model, users route per-model)', () => {
const other = 'other123'
const samples: PromSample[] = [
// DEPLOY is healthy
kvBlocks(0.5),
...prefillQueueBuckets(150),
// other is unhealthy
{
name: 'generator_kv_blocks_fraction:avg_by_deployment',
labels: { deployment_id: other },
value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005,
},
]
expect(classify(samples, [DEPLOY, other])).toBe('healthy')
})

test('degraded when all deployments are non-healthy and at least one is degraded', () => {
  const otherId = 'other123'
  // otherId trips the KV-blocks unhealthy backstop.
  const unhealthyKvSample: PromSample = {
    name: 'generator_kv_blocks_fraction:avg_by_deployment',
    labels: { deployment_id: otherId },
    value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005,
  }
  const samples: PromSample[] = [
    // DEPLOY is degraded (prefill queue p90 over the degraded threshold).
    kvBlocks(0.5),
    ...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
    unhealthyKvSample,
  ]
  expect(classify(samples, [DEPLOY, otherId])).toBe('degraded')
})

test('unhealthy only when every deployment is unhealthy', () => {
const other = 'other123'
const samples: PromSample[] = [
kvBlocks(KV_BLOCKS_UNHEALTHY_FRACTION + 0.005),
...prefillQueueBuckets(300),
{
name: 'generator_kv_blocks_fraction:avg_by_deployment',
labels: { deployment_id: other },
Expand Down
19 changes: 14 additions & 5 deletions web/src/server/free-session/fireworks-health.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,27 @@ async function probe(): Promise<FireworksHealth> {
return classify(samples, deploymentIds)
}

/** Admit if ANY deployment is healthy. One deployment per model (and per
 * region in the future) means a user's request routes to a specific
 * deployment based on their chosen model — a degraded or unhealthy
 * deployment for one model doesn't affect users whose model routes
 * elsewhere, and `createFireworksRequestWithFallback` falls back to the
 * standard Fireworks API on 5xx regardless. Only hold the queue when ALL
 * deployments are non-healthy. Degraded beats unhealthy so
 * observability/logs show we still have upstream reachable. */
export function classify(
  samples: PromSample[],
  deploymentIds: string[],
): FireworksHealth {
  // No deployments configured: nothing to gate on.
  if (deploymentIds.length === 0) return 'healthy'

  let anyDegraded = false
  for (const deploymentId of deploymentIds) {
    const h = classifyOne(samples, deploymentId)
    // A single healthy deployment is enough to admit traffic.
    if (h === 'healthy') return 'healthy'
    if (h === 'degraded') anyDegraded = true
  }
  // All deployments are non-healthy; prefer 'degraded' if any was reachable.
  return anyDegraded ? 'degraded' : 'unhealthy'
}

function classifyOne(samples: PromSample[], deploymentId: string): FireworksHealth {
Expand Down
Loading