diff --git a/.gitignore b/.gitignore index 1e67aef11..febe25045 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ debug/ # Infisical config (user-specific) .infisical.json +.gstack/ diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts index 9ed91fd0a..07b32cf14 100644 --- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts +++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts @@ -45,19 +45,35 @@ describe('Fireworks deployment routing', () => { }) it('isDeploymentCoolingDown returns false initially', () => { - expect(isDeploymentCoolingDown()).toBe(false) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false) }) it('isDeploymentCoolingDown returns true after markDeploymentScalingUp', () => { - markDeploymentScalingUp() - expect(isDeploymentCoolingDown()).toBe(true) + markDeploymentScalingUp(DEPLOYMENT_MODEL_ID) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true) }) it('isDeploymentCoolingDown returns false after resetDeploymentCooldown', () => { - markDeploymentScalingUp() - expect(isDeploymentCoolingDown()).toBe(true) + markDeploymentScalingUp(DEPLOYMENT_MODEL_ID) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true) resetDeploymentCooldown() - expect(isDeploymentCoolingDown()).toBe(false) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false) + }) + + it('cooldown on one deployment does not affect other deployments', () => { + const otherDeployment = 'accounts/james-65d217/deployments/other123' + markDeploymentScalingUp(DEPLOYMENT_MODEL_ID) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true) + expect(isDeploymentCoolingDown(otherDeployment)).toBe(false) + }) + + it('resetDeploymentCooldown with an id only clears that deployment', () => { + const otherDeployment = 'accounts/james-65d217/deployments/other123' + markDeploymentScalingUp(DEPLOYMENT_MODEL_ID) + markDeploymentScalingUp(otherDeployment) + 
resetDeploymentCooldown(DEPLOYMENT_MODEL_ID) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false) + expect(isDeploymentCoolingDown(otherDeployment)).toBe(true) }) it('DEPLOYMENT_COOLDOWN_MS is 2 minutes', () => { @@ -195,8 +211,8 @@ describe('Fireworks deployment routing', () => { expect(fetchCalls).toHaveLength(2) expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - // Verify cooldown was activated - expect(isDeploymentCoolingDown()).toBe(true) + // Verify cooldown was activated for this specific deployment + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true) } finally { spy.restore() } @@ -243,7 +259,7 @@ describe('Fireworks deployment routing', () => { expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) // Non-scaling 503 should NOT activate the cooldown - expect(isDeploymentCoolingDown()).toBe(false) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false) } finally { spy.restore() } @@ -283,7 +299,7 @@ describe('Fireworks deployment routing', () => { expect(fetchCalls).toHaveLength(2) expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID) expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID) - expect(isDeploymentCoolingDown()).toBe(false) + expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false) } finally { spy.restore() } @@ -291,7 +307,7 @@ describe('Fireworks deployment routing', () => { it('skips deployment during cooldown and goes straight to standard API', async () => { const spy = spyDeploymentHours(true) - markDeploymentScalingUp() + markDeploymentScalingUp(DEPLOYMENT_MODEL_ID) const fetchCalls: string[] = [] const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => { diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 6e304638d..e2202397b 100644 --- a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -44,22 +44,32 @@ export function isDeploymentHours(_now: Date = new 
Date()): boolean { } /** - * In-memory cooldown to avoid repeatedly hitting a deployment that is scaling up. - * After a DEPLOYMENT_SCALING_UP 503, we skip the deployment for this many ms. + * In-memory per-deployment cooldown to avoid repeatedly hitting a deployment + * that is scaling up. After a DEPLOYMENT_SCALING_UP 503, we skip that + * deployment for this many ms. Keyed by full deployment path so one + * deployment's cooldown doesn't affect routing to other deployments. */ export const DEPLOYMENT_COOLDOWN_MS = 2 * 60 * 1000 -let deploymentScalingUpUntil = 0 - -export function isDeploymentCoolingDown(): boolean { - return Date.now() < deploymentScalingUpUntil +const deploymentCooldowns = new Map<string, number>() + +export function isDeploymentCoolingDown(deploymentId: string): boolean { + const until = deploymentCooldowns.get(deploymentId) + if (until === undefined) return false + if (Date.now() < until) return true + deploymentCooldowns.delete(deploymentId) + return false } -export function markDeploymentScalingUp(): void { - deploymentScalingUpUntil = Date.now() + DEPLOYMENT_COOLDOWN_MS +export function markDeploymentScalingUp(deploymentId: string): void { + deploymentCooldowns.set(deploymentId, Date.now() + DEPLOYMENT_COOLDOWN_MS) } -export function resetDeploymentCooldown(): void { - deploymentScalingUpUntil = 0 +export function resetDeploymentCooldown(deploymentId?: string): void { + if (deploymentId === undefined) { + deploymentCooldowns.clear() + return + } + deploymentCooldowns.delete(deploymentId) } export function isFireworksModel(model: string): boolean { @@ -725,9 +735,9 @@ export async function createFireworksRequestWithFallback(params: { const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel] const shouldTryDeployment = useCustomDeployment && - deploymentModelId && + !!deploymentModelId && isDeploymentHours() && - !isDeploymentCoolingDown() + !isDeploymentCoolingDown(deploymentModelId) if (shouldTryDeployment) { logger.info( @@ -749,7 +759,7 @@ export 
async function createFireworksRequestWithFallback(params: { 'Fireworks custom deployment returned 5xx, falling back to standard API', ) if (errorText.includes('DEPLOYMENT_SCALING_UP')) { - markDeploymentScalingUp() + markDeploymentScalingUp(deploymentModelId) } // Fall through to standard API request below } else { diff --git a/web/src/server/free-session/__tests__/fireworks-health.test.ts b/web/src/server/free-session/__tests__/fireworks-health.test.ts index 3475769cd..40adeef62 100644 --- a/web/src/server/free-session/__tests__/fireworks-health.test.ts +++ b/web/src/server/free-session/__tests__/fireworks-health.test.ts @@ -108,11 +108,43 @@ describe('fireworks health classifier', () => { expect(classify([], [DEPLOY])).toBe('healthy') }) - test('worst-of across multiple deployments — unhealthy wins over degraded', () => { + test('healthy if any deployment is healthy (one deployment per model, users route per-model)', () => { const other = 'other123' const samples: PromSample[] = [ + // DEPLOY is healthy + kvBlocks(0.5), + ...prefillQueueBuckets(150), + // other is unhealthy + { + name: 'generator_kv_blocks_fraction:avg_by_deployment', + labels: { deployment_id: other }, + value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005, + }, + ] + expect(classify(samples, [DEPLOY, other])).toBe('healthy') + }) + + test('degraded when all deployments are non-healthy and at least one is degraded', () => { + const other = 'other123' + const samples: PromSample[] = [ + // DEPLOY is degraded (prefill queue over threshold) kvBlocks(0.5), ...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500), + // other is unhealthy (KV backstop) + { + name: 'generator_kv_blocks_fraction:avg_by_deployment', + labels: { deployment_id: other }, + value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005, + }, + ] + expect(classify(samples, [DEPLOY, other])).toBe('degraded') + }) + + test('unhealthy only when every deployment is unhealthy', () => { + const other = 'other123' + const samples: PromSample[] = [ + 
kvBlocks(KV_BLOCKS_UNHEALTHY_FRACTION + 0.005), + ...prefillQueueBuckets(300), { name: 'generator_kv_blocks_fraction:avg_by_deployment', labels: { deployment_id: other }, value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005, }, diff --git a/web/src/server/free-session/fireworks-health.ts b/web/src/server/free-session/fireworks-health.ts index 7d8e115e4..1c09eb8e2 100644 --- a/web/src/server/free-session/fireworks-health.ts +++ b/web/src/server/free-session/fireworks-health.ts @@ -110,18 +110,27 @@ async function probe(): Promise<FireworksHealth> { return classify(samples, deploymentIds) } -/** Treat the whole fleet as degraded/unhealthy if any single deployment is. */ +/** Admit if ANY deployment is healthy. One deployment per model (and per + * region in the future) means a user's request routes to a specific + * deployment based on their chosen model — a degraded or unhealthy + * deployment for one model doesn't affect users whose model routes + * elsewhere, and `createFireworksRequestWithFallback` falls back to the + * standard Fireworks API on 5xx regardless. Only hold the queue when ALL + * deployments are non-healthy. Degraded beats unhealthy so + * observability/logs show we still have upstream reachable. */ export function classify( samples: PromSample[], deploymentIds: string[], ): FireworksHealth { - let worst: FireworksHealth = 'healthy' + if (deploymentIds.length === 0) return 'healthy' + + let anyDegraded = false for (const deploymentId of deploymentIds) { const h = classifyOne(samples, deploymentId) - if (h === 'unhealthy') return 'unhealthy' - if (h === 'degraded') worst = 'degraded' + if (h === 'healthy') return 'healthy' + if (h === 'degraded') anyDegraded = true } - return worst + return anyDegraded ? 'degraded' : 'unhealthy' } function classifyOne(samples: PromSample[], deploymentId: string): FireworksHealth {