From a6382caa2d5b47d7fdd7ac48b29f5870b2fcbe37 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Wed, 10 Jun 2026 15:05:20 -0400 Subject: [PATCH 01/25] updates --- .../llm-benchmark-validate-goldens.yml | 8 + .../src/bin/llm_benchmark.rs | 81 ++++++-- .../src/llm/clients/mod.rs | 76 ++++++++ .../src/llm/clients/openrouter.rs | 146 +++++++++++++- tools/xtask-llm-benchmark/src/llm/provider.rs | 181 +++++++++++------- 5 files changed, 405 insertions(+), 87 deletions(-) diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index 17384a654e3..591d55a6a59 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -57,6 +57,13 @@ jobs: - name: Install pnpm if: matrix.lang == 'typescript' uses: ./.github/actions/setup-pnpm + with: + run_install: true + + - name: Build TypeScript SDK + if: matrix.lang == 'typescript' + run: pnpm build + working-directory: crates/bindings-typescript - name: Build llm-benchmark tool run: cargo install --path tools/xtask-llm-benchmark --locked @@ -72,5 +79,6 @@ jobs: env: MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" + LLM_BENCH_CSHARP_CONCURRENCY: "1" run: | llm_benchmark run --goldens-only --lang ${{ matrix.lang }} diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index c624fdc4108..2a931fbaa4d 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -255,11 +255,6 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { eprintln!("[warn] failed to upload task catalog: {e}"); } - let modes = config - .modes - .clone() - .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect()); - let RuntimeInit { runtime, provider: llm_provider, @@ -273,7 +268,29 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { let 
selectors: Option> = config.selectors.clone(); let selectors_ref: Option<&[String]> = selectors.as_deref(); + let modes = config + .modes + .clone() + .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect()); + + if config.goldens_only { + let rt = runtime.as_ref().expect("runtime required for --goldens-only"); + rt.block_on(build_goldens_only_for_lang( + config.host.clone(), + &bench_root, + config.lang, + selectors_ref, + ))?; + println!("[{}] goldens-only build complete", config.lang.as_str()); + return Ok(()); + } + if !config.goldens_only && !config.hash_only { + let rt = runtime.as_ref().expect("failed to initialize runtime for preflight"); + let provider = llm_provider.as_ref().expect("llm provider required for preflight"); + let routes = filter_routes(&config); + preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?; + let rt = runtime.as_ref().expect("failed to initialize runtime for goldens"); rt.block_on(ensure_goldens_built_once( config.host.clone(), @@ -517,6 +534,51 @@ fn short_hash(s: &str) -> &str { &s[..s.len().min(12)] } +fn preflight_llm_routes( + runtime: &Runtime, + llm_provider: &dyn LlmProvider, + routes: &[ModelRoute], + modes: &[String], +) -> Result<()> { + if routes.is_empty() { + return Ok(()); + } + + let mut search_flags = Vec::new(); + if modes.iter().any(|mode| mode == "search") { + search_flags.push(true); + } + if modes.iter().any(|mode| mode != "search") { + search_flags.push(false); + } + + let mut failures = Vec::new(); + for route in routes { + for search_enabled in &search_flags { + let mode_label = if *search_enabled { + "search/OpenRouter online" + } else { + "standard" + }; + + if let Err(err) = runtime.block_on(llm_provider.preflight_route(route, *search_enabled)) { + let msg = format!("{} ({mode_label}): {err:#}", route.display_name); + eprintln!("[preflight] FAILED {msg}"); + failures.push(msg); + } + } + } + + if !failures.is_empty() { + anyhow::bail!( + "LLM provider preflight failed before 
benchmark run:\n - {}", + failures.join("\n - ") + ); + } + + Ok(()) +} + /// Run benchmarks for a single mode. fn run_mode_benchmarks( mode: &str, @@ -538,15 +600,6 @@ fn run_mode_benchmarks( return Ok(Vec::new()); } - if config.goldens_only { - let rt = runtime.expect("runtime required for --goldens-only"); - let sels = config.selectors.as_deref(); - - rt.block_on(build_goldens_only_for_lang(config.host.clone(), bench_root, lang, sels))?; - println!("{:<12} [{:<10}] goldens-only build complete", mode, lang_str); - return Ok(Vec::new()); - } - // Run benchmarks for all matching routes let routes = filter_routes(config); diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs index 172beef8ff8..d8eba39c4d0 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs @@ -8,6 +8,9 @@ pub mod openai; pub mod openrouter; pub mod xai; +use anyhow::{bail, Result}; +use async_trait::async_trait; + pub use anthropic::AnthropicClient; pub use deepseek::DeepSeekClient; pub use google::GoogleGeminiClient; @@ -15,3 +18,76 @@ pub use meta::MetaLlamaClient; pub use openai::OpenAiClient; pub use openrouter::OpenRouterClient; pub use xai::XaiGrokClient; + +use crate::llm::prompt::BuiltPrompt; +use crate::llm::types::LlmOutput; + +#[derive(Debug, Clone)] +pub struct ClientPreflight { + summary: String, +} + +impl ClientPreflight { + pub fn new(summary: impl Into) -> Self { + Self { + summary: summary.into(), + } + } + + pub fn summary(&self) -> &str { + &self.summary + } +} + +#[async_trait] +pub trait LlmClient: Send + Sync { + fn provider_name(&self) -> &'static str; + + async fn preflight(&self, model: &str) -> Result { + bail!( + "{} credit preflight is not implemented for model '{}'", + self.provider_name(), + model + ) + } + + async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result; +} + +macro_rules! 
impl_direct_llm_client { + ($ty:ty, $provider_name:literal) => { + #[async_trait] + impl LlmClient for $ty { + fn provider_name(&self) -> &'static str { + $provider_name + } + + async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { + <$ty>::generate(self, model, prompt).await + } + } + }; +} + +impl_direct_llm_client!(OpenAiClient, "OpenAI"); +impl_direct_llm_client!(AnthropicClient, "Anthropic"); +impl_direct_llm_client!(GoogleGeminiClient, "Google"); +impl_direct_llm_client!(XaiGrokClient, "xAI"); +impl_direct_llm_client!(DeepSeekClient, "DeepSeek"); +impl_direct_llm_client!(MetaLlamaClient, "Meta"); + +#[async_trait] +impl LlmClient for OpenRouterClient { + fn provider_name(&self) -> &'static str { + "OpenRouter" + } + + async fn preflight(&self, _model: &str) -> Result { + let status = self.preflight_credits().await?; + Ok(ClientPreflight::new(status.summary())) + } + + async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { + OpenRouterClient::generate(self, model, prompt).await + } +} diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs index 623570298af..d35ce789d17 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs @@ -1,5 +1,6 @@ -use anyhow::{anyhow, Context, Result}; -use serde::Serialize; +use anyhow::{anyhow, bail, Context, Result}; +use serde::{Deserialize, Serialize}; +use std::env; use super::http::HttpClient; use super::oa_compat::OACompatResp; @@ -35,6 +36,81 @@ impl OpenRouterClient { Self { base, api_key, http } } + pub async fn preflight_credits(&self) -> Result { + let key_info = self.fetch_key_info().await?; + let min_credits = min_credits_threshold(); + + if let Some(remaining) = key_info.limit_remaining + && remaining <= min_credits + { + bail!( + "OpenRouter API key has insufficient remaining credits: {:.4} <= {:.4}", + remaining, + min_credits + ); + } + 
+ let account = match env::var("OPENROUTER_MANAGEMENT_API_KEY") + .ok() + .filter(|v| !v.trim().is_empty()) + { + Some(key) => Some(self.fetch_account_credits(&key).await?), + None => None, + }; + + if let Some(account) = &account + && account.remaining <= min_credits + { + bail!( + "OpenRouter account has insufficient remaining credits: {:.4} <= {:.4}", + account.remaining, + min_credits + ); + } + + if account.is_none() && key_info.limit_remaining.is_none() { + bail!( + "OpenRouter API key has no configured credit limit and account credits were not checked. \ + Set OPENROUTER_MANAGEMENT_API_KEY for account balance preflight." + ); + } + + Ok(OpenRouterCreditStatus { + key_limit: key_info.limit, + key_limit_remaining: key_info.limit_remaining, + account_remaining: account.map(|a| a.remaining), + min_credits, + }) + } + + async fn fetch_key_info(&self) -> Result { + let url = format!("{}/key", self.base.trim_end_matches('/')); + let auth = HttpClient::bearer(&self.api_key); + let body = self + .http + .get_text(&url, &[auth]) + .await + .with_context(|| format!("OpenRouter key preflight GET {}", url))?; + + let resp: OpenRouterKeyResp = serde_json::from_str(&body).context("parse OpenRouter key response")?; + Ok(resp.data) + } + + async fn fetch_account_credits(&self, management_key: &str) -> Result { + let url = format!("{}/credits", self.base.trim_end_matches('/')); + let auth = HttpClient::bearer(management_key); + let body = self + .http + .get_text(&url, &[auth]) + .await + .with_context(|| format!("OpenRouter account credit preflight GET {}", url))?; + + let resp: OpenRouterCreditsResp = serde_json::from_str(&body).context("parse OpenRouter credits response")?; + Ok(OpenRouterAccountCredits { + remaining: resp.data.total_credits - resp.data.total_usage, + }) + } + pub async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { let url = format!("{}/chat/completions", self.base.trim_end_matches('/')); @@ -125,6 +201,72 @@ impl OpenRouterClient { } } 
+#[derive(Debug, Clone)] +pub struct OpenRouterCreditStatus { + pub key_limit: Option, + pub key_limit_remaining: Option, + pub account_remaining: Option, + pub min_credits: f64, +} + +impl OpenRouterCreditStatus { + pub fn summary(&self) -> String { + let key_remaining = match (self.key_limit, self.key_limit_remaining) { + (Some(limit), Some(remaining)) => format!("key remaining {remaining:.4}/{limit:.4}"), + (Some(limit), None) => format!("key limit {limit:.4}, remaining unknown"), + (None, Some(remaining)) => format!("key remaining {remaining:.4}"), + (None, None) => "key has no configured limit".to_string(), + }; + + match self.account_remaining { + Some(remaining) => { + format!( + "{key_remaining}; account remaining {remaining:.4}; min {:.4}", + self.min_credits + ) + } + None => format!( + "{key_remaining}; account balance not checked (set OPENROUTER_MANAGEMENT_API_KEY); min {:.4}", + self.min_credits + ), + } + } +} + +#[derive(Debug, Deserialize)] +struct OpenRouterKeyResp { + data: OpenRouterKeyInfo, +} + +#[derive(Debug, Deserialize)] +struct OpenRouterKeyInfo { + limit: Option, + limit_remaining: Option, +} + +#[derive(Debug, Deserialize)] +struct OpenRouterCreditsResp { + data: OpenRouterCreditsData, +} + +#[derive(Debug, Deserialize)] +struct OpenRouterCreditsData { + total_credits: f64, + total_usage: f64, +} + +#[derive(Debug, Clone)] +struct OpenRouterAccountCredits { + remaining: f64, +} + +fn min_credits_threshold() -> f64 { + env::var("LLM_MIN_CREDITS") + .ok() + .and_then(|v| v.trim().parse::().ok()) + .unwrap_or(0.0) +} + /// Context limits for models accessed via OpenRouter. /// Uses the same limits as direct clients where known, /// falls back to a conservative default. 
diff --git a/tools/xtask-llm-benchmark/src/llm/provider.rs b/tools/xtask-llm-benchmark/src/llm/provider.rs index 65d587d9526..fd906f62773 100644 --- a/tools/xtask-llm-benchmark/src/llm/provider.rs +++ b/tools/xtask-llm-benchmark/src/llm/provider.rs @@ -2,7 +2,8 @@ use anyhow::{Context, Result}; use async_trait::async_trait; use crate::llm::clients::{ - AnthropicClient, DeepSeekClient, GoogleGeminiClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, XaiGrokClient, + AnthropicClient, DeepSeekClient, GoogleGeminiClient, LlmClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, + XaiGrokClient, }; use crate::llm::model_routes::ModelRoute; use crate::llm::prompt::BuiltPrompt; @@ -10,6 +11,7 @@ use crate::llm::types::{LlmOutput, Vendor}; #[async_trait] pub trait LlmProvider: Send + Sync { + async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()>; async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result; } @@ -51,98 +53,135 @@ impl RouterProvider { } } +struct ResolvedClient<'a> { + client: &'a dyn LlmClient, + endpoint_name: &'static str, + model: String, + fallback_from: Option<&'static str>, + search_enabled: bool, +} + #[async_trait] impl LlmProvider for RouterProvider { + async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()> { + let resolved = self.resolve_client(route, search_enabled)?; + let status = resolved.client.preflight(&resolved.model).await.with_context(|| { + format!( + "{} credit preflight failed for model '{}'", + resolved.endpoint_name, resolved.model + ) + })?; + + eprintln!( + "[preflight] {} -> {} '{}' OK ({})", + route.display_name, + resolved.endpoint_name, + resolved.model, + status.summary() + ); + Ok(()) + } + async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result { - // Web search mode: route all models through OpenRouter with :online suffix. - // OpenRouter's :online feature adds Bing-powered web search to any model. 
- if prompt.search_enabled { - let cli = self.openrouter.as_ref().context( - "Search mode requires OPENROUTER_API_KEY — OpenRouter provides unified web search via :online models", - )?; + let resolved = self.resolve_client(route, prompt.search_enabled)?; + + if resolved.search_enabled { + eprintln!( + "[search] {} -> OpenRouter :online model '{}'", + route.display_name, resolved.model + ); + } else if let Some(vendor_name) = resolved.fallback_from { + eprintln!( + "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'", + vendor_name, resolved.model + ); + } + + resolved.client.generate(&resolved.model, prompt).await + } +} + +impl RouterProvider { + fn resolve_client<'a>(&'a self, route: &ModelRoute, search_enabled: bool) -> Result> { + if search_enabled { let base_model = route .openrouter_model .clone() .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model)); - let online_model = format!("{base_model}:online"); - eprintln!( - "[search] {} → OpenRouter :online model '{}'", - route.display_name, online_model - ); - return cli.generate(&online_model, prompt).await; + return self.resolve_openrouter(format!("{base_model}:online"), None, true); } let vendor = self.force.unwrap_or(route.vendor); - // If vendor is explicitly OpenRouter, or if the direct client isn't configured - // but OpenRouter is available, route through OpenRouter. if vendor == Vendor::OpenRouter { - let cli = self - .openrouter - .as_ref() - .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?; let model = route.openrouter_model.as_deref().unwrap_or(&route.api_model); - return cli.generate(model, prompt).await; + return self.resolve_openrouter(model.to_string(), None, false); } - // Try direct client first, fall back to OpenRouter if available. 
match vendor { - Vendor::OpenAi => match self.openai.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "OpenAI").await, - }, - Vendor::Anthropic => match self.anthropic.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "Anthropic").await, - }, - Vendor::Google => match self.google.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "Google").await, - }, - Vendor::Xai => match self.xai.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "xAI").await, - }, - Vendor::DeepSeek => match self.deepseek.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "DeepSeek").await, - }, - Vendor::Meta => match self.meta.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "Meta").await, - }, + Vendor::OpenAi => { + self.resolve_direct_or_openrouter(self.openai.as_ref().map(|c| c as &dyn LlmClient), route, vendor) + } + Vendor::Anthropic => { + self.resolve_direct_or_openrouter(self.anthropic.as_ref().map(|c| c as &dyn LlmClient), route, vendor) + } + Vendor::Google => { + self.resolve_direct_or_openrouter(self.google.as_ref().map(|c| c as &dyn LlmClient), route, vendor) + } + Vendor::Xai => { + self.resolve_direct_or_openrouter(self.xai.as_ref().map(|c| c as &dyn LlmClient), route, vendor) + } + Vendor::DeepSeek => { + self.resolve_direct_or_openrouter(self.deepseek.as_ref().map(|c| c as &dyn LlmClient), route, vendor) + } + Vendor::Meta => { + self.resolve_direct_or_openrouter(self.meta.as_ref().map(|c| c as &dyn LlmClient), route, vendor) + } Vendor::OpenRouter => unreachable!("handled above"), } } -} -impl RouterProvider { - /// Fall back to the OpenRouter client 
when a direct vendor client is not configured. - async fn fallback_openrouter( - &self, + fn resolve_direct_or_openrouter<'a>( + &'a self, + direct: Option<&'a dyn LlmClient>, route: &ModelRoute, - prompt: &BuiltPrompt, - vendor_name: &str, - ) -> Result { - match self.openrouter.as_ref() { - Some(cli) => { - let or_model = route - .openrouter_model - .clone() - .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model)); - eprintln!( - "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'", - vendor_name, or_model - ); - cli.generate(&or_model, prompt).await - } - None => anyhow::bail!( - "{} client not configured and no OpenRouter fallback available. \ - Set {}_API_KEY or OPENROUTER_API_KEY.", - vendor_name, - vendor_name.to_ascii_uppercase() - ), + vendor: Vendor, + ) -> Result> { + if let Some(client) = direct { + return Ok(ResolvedClient { + client, + endpoint_name: vendor.display_name(), + model: route.api_model.clone(), + fallback_from: None, + search_enabled: false, + }); } + + let model = route + .openrouter_model + .clone() + .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model)); + self.resolve_openrouter(model, Some(vendor.display_name()), false) + } + + fn resolve_openrouter<'a>( + &'a self, + model: String, + fallback_from: Option<&'static str>, + search_enabled: bool, + ) -> Result> { + let client = self + .openrouter + .as_ref() + .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?; + + Ok(ResolvedClient { + client, + endpoint_name: "OpenRouter", + model, + fallback_from, + search_enabled, + }) } } From 711ff882a2cb421c6919f629cb5e2eaee594d4f7 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:01:49 -0400 Subject: [PATCH 02/25] Update provider.rs --- tools/xtask-llm-benchmark/src/llm/provider.rs | 72 ++++++++----------- 1 file changed, 31 insertions(+), 41 deletions(-) diff --git 
a/tools/xtask-llm-benchmark/src/llm/provider.rs b/tools/xtask-llm-benchmark/src/llm/provider.rs index fd906f62773..355f2e19a3e 100644 --- a/tools/xtask-llm-benchmark/src/llm/provider.rs +++ b/tools/xtask-llm-benchmark/src/llm/provider.rs @@ -1,5 +1,6 @@ use anyhow::{Context, Result}; use async_trait::async_trait; +use std::collections::HashMap; use crate::llm::clients::{ AnthropicClient, DeepSeekClient, GoogleGeminiClient, LlmClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, @@ -16,15 +17,7 @@ pub trait LlmProvider: Send + Sync { } pub struct RouterProvider { - pub openai: Option, - pub anthropic: Option, - pub google: Option, - pub xai: Option, - pub deepseek: Option, - pub meta: Option, - /// OpenRouter client used as a unified fallback when a direct vendor client - /// is not configured. Set via `OPENROUTER_API_KEY`. - pub openrouter: Option, + clients: HashMap>, pub force: Option, } @@ -40,16 +33,31 @@ impl RouterProvider { openrouter: Option, force: Option, ) -> Self { - Self { - openai, - anthropic, - google, - xai, - deepseek, - meta, - openrouter, - force, + let mut clients: HashMap> = HashMap::new(); + + if let Some(client) = openai { + clients.insert(Vendor::OpenAi, Box::new(client)); + } + if let Some(client) = anthropic { + clients.insert(Vendor::Anthropic, Box::new(client)); + } + if let Some(client) = google { + clients.insert(Vendor::Google, Box::new(client)); + } + if let Some(client) = xai { + clients.insert(Vendor::Xai, Box::new(client)); + } + if let Some(client) = deepseek { + clients.insert(Vendor::DeepSeek, Box::new(client)); } + if let Some(client) = meta { + clients.insert(Vendor::Meta, Box::new(client)); + } + if let Some(client) = openrouter { + clients.insert(Vendor::OpenRouter, Box::new(client)); + } + + Self { clients, force } } } @@ -118,27 +126,8 @@ impl RouterProvider { return self.resolve_openrouter(model.to_string(), None, false); } - match vendor { - Vendor::OpenAi => { - 
self.resolve_direct_or_openrouter(self.openai.as_ref().map(|c| c as &dyn LlmClient), route, vendor) - } - Vendor::Anthropic => { - self.resolve_direct_or_openrouter(self.anthropic.as_ref().map(|c| c as &dyn LlmClient), route, vendor) - } - Vendor::Google => { - self.resolve_direct_or_openrouter(self.google.as_ref().map(|c| c as &dyn LlmClient), route, vendor) - } - Vendor::Xai => { - self.resolve_direct_or_openrouter(self.xai.as_ref().map(|c| c as &dyn LlmClient), route, vendor) - } - Vendor::DeepSeek => { - self.resolve_direct_or_openrouter(self.deepseek.as_ref().map(|c| c as &dyn LlmClient), route, vendor) - } - Vendor::Meta => { - self.resolve_direct_or_openrouter(self.meta.as_ref().map(|c| c as &dyn LlmClient), route, vendor) - } - Vendor::OpenRouter => unreachable!("handled above"), - } + let direct = self.clients.get(&vendor).map(|client| client.as_ref()); + self.resolve_direct_or_openrouter(direct, route, vendor) } fn resolve_direct_or_openrouter<'a>( @@ -171,8 +160,9 @@ impl RouterProvider { search_enabled: bool, ) -> Result> { let client = self - .openrouter - .as_ref() + .clients + .get(&Vendor::OpenRouter) + .map(|client| client.as_ref()) .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?; Ok(ResolvedClient { From e82f0aef0f4e37b220b95f8d16cf14bc2a493eaf Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Thu, 11 Jun 2026 14:45:48 -0400 Subject: [PATCH 03/25] updates --- .github/workflows/llm-benchmark-periodic.yml | 25 +- tools/xtask-llm-benchmark/src/api/client.rs | 327 +++++++++++++++++- .../xtask-llm-benchmark/src/bench/analysis.rs | 6 +- .../src/bench/publishers.rs | 2 + tools/xtask-llm-benchmark/src/bench/runner.rs | 5 +- tools/xtask-llm-benchmark/src/bench/types.rs | 5 + .../src/bin/llm_benchmark.rs | 282 +++++++++++++-- .../src/llm/clients/anthropic.rs | 6 + .../src/llm/clients/openrouter.rs | 24 +- .../src/llm/model_routes.rs | 43 ++- .../src/llm/segmentation.rs | 38 +- 11 files 
changed, 685 insertions(+), 78 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 40ad2c75fe4..c0d9cc93d8c 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -2,11 +2,14 @@ name: Periodic LLM benchmarks on: schedule: - # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h, - # or '0 */4 * * *' for every 4h. - - cron: '0 0 * * *' + # Weekly on Monday at midnight UTC. + - cron: '0 0 * * 1' workflow_dispatch: inputs: + run_id: + description: 'Website-created benchmark run id. When set, run spec is fetched from the website.' + required: false + default: '' models: description: 'Models to run (provider:model format, comma-separated, or "all")' required: false @@ -62,6 +65,12 @@ jobs: - name: Install pnpm uses: ./.github/actions/setup-pnpm + with: + run_install: true + + - name: Build TypeScript SDK + run: pnpm build + working-directory: crates/bindings-typescript - name: Build llm-benchmark tool run: cargo install --path tools/xtask-llm-benchmark --locked @@ -82,19 +91,27 @@ jobs: LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }} MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" + LLM_BENCH_CSHARP_CONCURRENCY: "1" + INPUT_RUN_ID: ${{ inputs.run_id || '' }} INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }} INPUT_MODELS: ${{ inputs.models || 'all' }} INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }} run: | + RUN_ID="$INPUT_RUN_ID" LANGS="$INPUT_LANGUAGES" MODELS="$INPUT_MODELS" MODES="$INPUT_MODES" + if [ -n "$RUN_ID" ]; then + llm_benchmark run-from-api --run-id "$RUN_ID" + exit $? 
+ fi + SUCCEEDED=0 FAILED=0 for LANG in $(echo "$LANGS" | tr ',' ' '); do if [ "$MODELS" = "all" ]; then - if llm_benchmark run --lang "$LANG" --modes "$MODES"; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG" diff --git a/tools/xtask-llm-benchmark/src/api/client.rs b/tools/xtask-llm-benchmark/src/api/client.rs index edc61756152..13fde0f8e81 100644 --- a/tools/xtask-llm-benchmark/src/api/client.rs +++ b/tools/xtask-llm-benchmark/src/api/client.rs @@ -1,14 +1,155 @@ -use anyhow::{Context, Result}; +use anyhow::{anyhow, Context, Result}; +use serde::Deserialize; use serde_json::json; +use std::str::FromStr; use crate::bench::normalize::{canonical_mode, normalize_model_names}; use crate::bench::types::{Results, RunOutcome}; +use crate::eval::Lang; +use crate::llm::types::Vendor; +use crate::llm::ModelRoute; + +#[derive(Debug, Clone)] +pub struct RemoteRunSpec { + pub run_id: String, + pub languages: Vec, + pub modes: Vec, + pub routes: Vec, + pub categories: Option>, + pub tasks: Option>, +} + +#[derive(Debug, Deserialize)] +struct RemoteModelRouteRow { + #[serde(alias = "displayName", alias = "name")] + display_name: String, + vendor: String, + #[serde(alias = "apiModel")] + api_model: String, + #[serde(default, alias = "openrouterModel")] + openrouter_model: Option, + #[serde(default)] + active: Option, + #[serde(default)] + available: Option, +} + +#[derive(Debug, Deserialize)] +struct RawRunSpec { + #[serde(default)] + id: Option, + #[serde(default, alias = "runId")] + run_id: Option, + languages: Vec, + modes: Vec, + #[serde(default, alias = "routes")] + models: Vec, + #[serde(default)] + categories: Option>, + #[serde(default)] + tasks: Option>, +} + +fn parse_model_route_row(row: RemoteModelRouteRow) -> Result> { + if row.active == Some(false) || row.available == Some(false) { + return Ok(None); + } + + let vendor = 
Vendor::parse(&row.vendor).ok_or_else(|| anyhow!("unknown model vendor '{}'", row.vendor))?; + let display_name = row.display_name.trim(); + let api_model = row.api_model.trim(); + + if display_name.is_empty() { + anyhow::bail!("remote model row is missing display_name"); + } + if api_model.is_empty() { + anyhow::bail!("remote model row '{}' is missing api_model", display_name); + } + + Ok(Some(ModelRoute::new( + display_name, + vendor, + api_model, + row.openrouter_model.as_deref().filter(|s| !s.trim().is_empty()), + ))) +} + +pub fn parse_model_routes_response(body: &serde_json::Value) -> Result> { + let models = body.get("models").unwrap_or(body); + let rows: Vec = + serde_json::from_value(models.clone()).context("parse llm benchmark model rows")?; + + let mut routes = Vec::new(); + for row in rows { + if let Some(route) = parse_model_route_row(row)? { + routes.push(route); + } + } + + if routes.is_empty() { + anyhow::bail!("no active available LLM benchmark models returned by website"); + } + + Ok(routes) +} + +pub fn parse_run_spec_response(body: &serde_json::Value, fallback_run_id: &str) -> Result { + let spec = body.get("spec").or_else(|| body.get("spec_json")).unwrap_or(body); + let spec = match spec.as_str() { + Some(s) => serde_json::from_str::(s).context("parse run spec_json string")?, + None => spec.clone(), + }; + + let raw: RawRunSpec = serde_json::from_value(spec).context("parse llm benchmark run spec")?; + let run_id = raw.run_id.or(raw.id).unwrap_or_else(|| fallback_run_id.to_string()); + + let languages = raw + .languages + .iter() + .map(|lang| Lang::from_str(lang).map_err(|e| anyhow!(e))) + .collect::>>()?; + if languages.is_empty() { + anyhow::bail!("run spec '{}' has no languages", run_id); + } + + let modes: Vec = raw + .modes + .into_iter() + .map(|mode| mode.trim().to_string()) + .filter(|mode| !mode.is_empty()) + .collect(); + if modes.is_empty() { + anyhow::bail!("run spec '{}' has no modes", run_id); + } + + let mut routes = Vec::new(); 
+ for row in raw.models { + if let Some(route) = parse_model_route_row(row)? { + routes.push(route); + } + } + if routes.is_empty() { + anyhow::bail!("run spec '{}' has no active available models", run_id); + } + + Ok(RemoteRunSpec { + run_id, + languages, + modes, + routes, + categories: raw.categories, + tasks: raw.tasks, + }) +} /// HTTP client for the SpacetimeDB LLM benchmark API (spacetime-web Postgres). /// -/// Supports two POST endpoints that already exist in spacetime-web: -/// - `POST /api/llm-benchmark-upload` — upload benchmark results -/// - `POST /api/llm-benchmark-tasks` — upload task catalog +/// Supports endpoints owned by spacetime-web: +/// - `POST /api/llm-benchmark-upload` - upload benchmark results +/// - `POST /api/llm-benchmark-tasks` - upload task catalog +/// - `GET /api/llm-benchmark-models?active=true` - fetch active benchmark models +/// - `GET /api/llm-benchmark-runs/{run_id}` - fetch admin-triggered run specs +/// - `PATCH /api/llm-benchmark-runs/{run_id}` - update admin-triggered run status #[derive(Clone)] pub struct ApiClient { client: reqwest::blocking::Client, @@ -44,7 +185,13 @@ impl ApiClient { /// Upload a batch of run outcomes for a single (lang, mode) combination. /// Normalizes model names and sanitizes volatile fields before upload. /// If `analysis` is provided, it is stored in the `llm_benchmark_analysis` table. 
- pub fn upload_batch(&self, mode: &str, outcomes: &[RunOutcome], analysis: Option<&str>) -> Result { + pub fn upload_batch( + &self, + mode: &str, + outcomes: &[RunOutcome], + analysis: Option<&str>, + run_id: Option<&str>, + ) -> Result { if outcomes.is_empty() { return Ok(0); } @@ -85,12 +232,15 @@ impl ApiClient { } } - let payload = json!({ + let mut payload = json!({ "lang": lang_entry.lang, "mode": mode_entry.mode, "hash": mode_entry.hash, "models": models_json, }); + if let Some(run_id) = run_id { + payload["run_id"] = json!(run_id); + } let resp = self .client @@ -113,7 +263,7 @@ impl ApiClient { let status = resp.status(); let body = resp.text().unwrap_or_default(); anyhow::bail!( - "upload failed for {}/{}: {} — {}", + "upload failed for {}/{}: {} - {}", lang_entry.lang, mode_entry.mode, status, @@ -126,6 +276,100 @@ impl ApiClient { Ok(total_uploaded) } + /// Fetch active/available benchmark models from the website model registry. + pub fn fetch_model_routes(&self) -> Result> { + let url = format!("{}/api/llm-benchmark-models?active=true", self.base_url); + let resp = self + .client + .get(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .send() + .context("fetch LLM benchmark models failed")?; + + if resp.status().is_success() { + let body: serde_json::Value = resp.json().context("parse model registry response")?; + parse_model_routes_response(&body) + } else { + let status = resp.status(); + let body = resp.text().unwrap_or_default(); + anyhow::bail!("fetch LLM benchmark models failed: {} - {}", status, body); + } + } + + /// Fetch an immutable website-created run spec for admin-triggered runs. 
+ pub fn fetch_run_spec(&self, run_id: &str) -> Result { + let run_id_path = urlencoding::encode(run_id); + let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path); + let resp = self + .client + .get(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .send() + .with_context(|| format!("fetch LLM benchmark run spec failed for {run_id}"))?; + + if resp.status().is_success() { + let body: serde_json::Value = resp.json().context("parse run spec response")?; + parse_run_spec_response(&body, run_id) + } else { + let status = resp.status(); + let body = resp.text().unwrap_or_default(); + anyhow::bail!( + "fetch LLM benchmark run spec failed for {}: {} - {}", + run_id, + status, + body + ); + } + } + + /// Update website-created benchmark run status. + pub fn update_run_status(&self, run_id: &str, status: &str, error: Option<&str>) -> Result<()> { + let run_id_path = urlencoding::encode(run_id); + let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path); + let mut payload = json!({ + "status": status, + }); + if let Some(error) = error { + payload["error"] = json!(error); + } + if let Ok(github_run_id) = std::env::var("GITHUB_RUN_ID") + && !github_run_id.is_empty() + { + payload["github_run_id"] = json!(github_run_id); + if let (Ok(server_url), Ok(repo)) = (std::env::var("GITHUB_SERVER_URL"), std::env::var("GITHUB_REPOSITORY")) + { + payload["github_run_url"] = json!(format!( + "{}/{}/actions/runs/{}", + server_url.trim_end_matches('/'), + repo, + payload["github_run_id"].as_str().unwrap_or_default() + )); + } + } + + let resp = self + .client + .patch(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&payload) + .send() + .with_context(|| format!("update LLM benchmark run status failed for {run_id}"))?; + + if resp.status().is_success() { + Ok(()) + } else { + let status_code = resp.status(); + let body = resp.text().unwrap_or_default(); 
+ anyhow::bail!( + "update LLM benchmark run status failed for {}: {} - {}", + run_id, + status_code, + body + ); + } + } + /// Upload the task catalog to `POST /api/llm-benchmark-tasks`, derived from /// the benchmarks directory structure on disk. pub fn upload_task_catalog(&self, bench_root: &std::path::Path) -> Result { @@ -334,3 +578,72 @@ impl ApiClient { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_active_available_model_routes() { + let body = json!({ + "models": [ + { + "displayName": "GPT Test", + "vendor": "openai", + "apiModel": "gpt-test", + "openrouterModel": "openai/gpt-test", + "active": true, + "available": true + }, + { + "displayName": "Inactive", + "vendor": "openai", + "apiModel": "inactive", + "active": false, + "available": true + }, + { + "displayName": "Unavailable", + "vendor": "openai", + "apiModel": "unavailable", + "active": true, + "available": false + } + ] + }); + + let routes = parse_model_routes_response(&body).unwrap(); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "GPT Test"); + assert_eq!(routes[0].vendor, Vendor::OpenAi); + assert_eq!(routes[0].api_model, "gpt-test"); + assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test")); + } + + #[test] + fn parses_run_spec_response() { + let body = json!({ + "spec_json": { + "languages": ["rust", "typescript"], + "modes": ["guidelines", "no_context"], + "categories": ["basics"], + "tasks": ["t_001_basic_tables"], + "models": [{ + "display_name": "Claude Test", + "vendor": "anthropic", + "api_model": "claude-test", + "openrouter_model": "anthropic/claude-test" + }] + } + }); + + let spec = parse_run_spec_response(&body, "run-123").unwrap(); + assert_eq!(spec.run_id, "run-123"); + assert_eq!(spec.languages, vec![Lang::Rust, Lang::TypeScript]); + assert_eq!(spec.modes, vec!["guidelines", "no_context"]); + assert_eq!(spec.categories.as_deref(), Some(&["basics".to_string()][..])); + assert_eq!(spec.tasks.as_deref(), 
Some(&["t_001_basic_tables".to_string()][..])); + assert_eq!(spec.routes.len(), 1); + assert_eq!(spec.routes[0].vendor, Vendor::Anthropic); + } +} diff --git a/tools/xtask-llm-benchmark/src/bench/analysis.rs b/tools/xtask-llm-benchmark/src/bench/analysis.rs index 0234cba1b8f..cb23fbb6cf5 100644 --- a/tools/xtask-llm-benchmark/src/bench/analysis.rs +++ b/tools/xtask-llm-benchmark/src/bench/analysis.rs @@ -27,10 +27,10 @@ pub async fn run_analysis( let prompt = build_prompt(lang, mode, model_name, bench_root, &failures); let route = ModelRoute::new( - "gpt-4.1-mini", + "gpt-5.4-mini", crate::llm::types::Vendor::OpenAi, - "gpt-4.1-mini", - Some("openai/gpt-4.1-mini"), + "gpt-5.4-mini", + Some("openai/gpt-5.4-mini"), ); let built = BuiltPrompt { diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 68775ff631c..55b8a98d5b5 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -73,6 +73,8 @@ fn is_transient_build_error(stderr: &str, stdout: &str) -> bool { // trying to extract the same tarball simultaneously || (combined.contains("wasi-sdk") && combined.contains("tar")) || (combined.contains("MSB3073") && combined.contains("exited with code 2")) + // dotnet can crash below spacetime while spacetime exits 1. 
+ || combined.contains("code Result<()> { diff --git a/tools/xtask-llm-benchmark/src/bench/runner.rs b/tools/xtask-llm-benchmark/src/bench/runner.rs index 42acd77a70c..92acc429b70 100644 --- a/tools/xtask-llm-benchmark/src/bench/runner.rs +++ b/tools/xtask-llm-benchmark/src/bench/runner.rs @@ -633,7 +633,7 @@ pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Resu } }; if let Some(ref api) = cfg.api_client { - api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?; + api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?; } else { eprintln!("[runner] no API client configured; skipping upload"); } @@ -832,7 +832,7 @@ pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> } }; if let Some(ref api) = cfg.api_client { - api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?; + api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?; } else { eprintln!("[runner] no API client configured; skipping upload"); } @@ -865,6 +865,7 @@ pub async fn run_selected_or_all_for_model_async_for_lang(ctx: &BenchRunContext< dry_run: ctx.dry_run, local_analysis: ctx.local_analysis, dry_run_id: ctx.dry_run_id.clone(), + run_id: ctx.run_id.clone(), }; return run_selected_for_model_async_for_lang(&sel_cfg).await; } diff --git a/tools/xtask-llm-benchmark/src/bench/types.rs b/tools/xtask-llm-benchmark/src/bench/types.rs index 930e3feac1c..57bbed9a149 100644 --- a/tools/xtask-llm-benchmark/src/bench/types.rs +++ b/tools/xtask-llm-benchmark/src/bench/types.rs @@ -188,6 +188,7 @@ pub struct BenchRunContext<'a> { pub dry_run: bool, pub local_analysis: bool, pub dry_run_id: Option, + pub run_id: Option, } pub struct RunConfig { @@ -209,4 +210,8 @@ pub struct RunConfig { pub local_analysis: bool, /// Shared identifier used to group dry-run artifacts pub dry_run_id: Option, + /// Website-created run identifier for uploaded results + pub run_id: Option, + /// Website-provided route list 
used instead of static default_model_routes() + pub route_overrides: Option>, } diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index 2a931fbaa4d..219a770b502 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -1,7 +1,7 @@ #![allow(clippy::disallowed_macros, clippy::type_complexity, clippy::enum_variant_names)] use anyhow::{Context, Result}; -use clap::{Args, Parser, Subcommand}; +use clap::{Args, Parser, Subcommand, ValueEnum}; use futures::{StreamExt, TryStreamExt}; use spacetimedb_data_structures::map::{HashCollectionExt as _, HashMap, HashSet}; use spacetimedb_guard::SpacetimeDbGuard; @@ -71,11 +71,20 @@ struct Cli { command: Commands, } +#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] +enum ModelSource { + Static, + Remote, +} + #[derive(Subcommand, Debug)] enum Commands { /// Run benchmarks / build goldens / compute hashes. Run(RunArgs), + /// Run a website-created benchmark spec by id. + RunFromApi(RunFromApiArgs), + /// Run AI analysis on existing benchmark failures from the database. 
Analyze(AnalyzeArgs), } @@ -124,6 +133,10 @@ struct RunArgs { #[arg(long, num_args = 1..)] models: Option>, + /// Where to resolve models when --models is not provided + #[arg(long, value_enum, default_value_t = ModelSource::Static)] + model_source: ModelSource, + /// Run benchmarks without uploading results #[arg(long)] dry_run: bool, @@ -131,6 +144,19 @@ struct RunArgs { /// When used with --dry-run, also generate local markdown analysis files #[arg(long, requires = "dry_run")] local_analysis: bool, + + #[arg(skip)] + route_overrides: Option>, + + #[arg(skip)] + run_id: Option, +} + +#[derive(Args, Debug, Clone)] +struct RunFromApiArgs { + /// Website-created llm_benchmark_runs id + #[arg(long)] + run_id: String, } #[derive(Args, Debug, Clone)] @@ -202,6 +228,7 @@ fn main() -> Result<()> { match cli.command { Commands::Run(args) => cmd_run(args), + Commands::RunFromApi(args) => cmd_run_from_api(args), Commands::Analyze(args) => cmd_analyze(args), } } @@ -213,11 +240,63 @@ fn cmd_run(args: RunArgs) -> Result<()> { Ok(()) } +fn cmd_run_from_api(args: RunFromApiArgs) -> Result<()> { + let api = ApiClient::from_env() + .context("failed to initialize API client")? 
+ .context("LLM_BENCHMARK_UPLOAD_URL required for run-from-api")?; + if let Err(e) = api.update_run_status(&args.run_id, "running", None) { + eprintln!("[warn] failed to mark website benchmark run as running: {e:#}"); + } + + let result = cmd_run_from_api_inner(&api, &args.run_id); + match result { + Ok(()) => { + if let Err(e) = api.update_run_status(&args.run_id, "succeeded", None) { + eprintln!("[warn] failed to mark website benchmark run as succeeded: {e:#}"); + } + Ok(()) + } + Err(e) => { + let message = format!("{e:#}"); + if let Err(status_err) = api.update_run_status(&args.run_id, "failed", Some(&message)) { + eprintln!("[warn] failed to mark website benchmark run as failed: {status_err:#}"); + } + Err(e) + } + } +} + +fn cmd_run_from_api_inner(api: &ApiClient, run_id: &str) -> Result<()> { + let spec = api.fetch_run_spec(run_id)?; + + for lang in &spec.languages { + run_benchmarks(RunArgs { + modes: Some(spec.modes.clone()), + lang: *lang, + hash_only: false, + goldens_only: false, + force: false, + categories: spec.categories.clone(), + tasks: spec.tasks.clone(), + providers: None, + models: None, + model_source: ModelSource::Static, + dry_run: false, + local_analysis: false, + route_overrides: Some(spec.routes.clone()), + run_id: Some(spec.run_id.clone()), + })?; + } + + Ok(()) +} + /// Core benchmark runner used by both `run` and `ci-quickfix` fn run_benchmarks(args: RunArgs) -> Result<()> { let dry_run = args.dry_run; let local_analysis = args.local_analysis; let dry_run_id = dry_run.then(|| chrono::Utc::now().format("%Y-%m-%d_%H%M%S").to_string()); + let should_fetch_remote_routes = should_fetch_remote_routes(&args); let api_client = if dry_run { None @@ -244,8 +323,17 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { dry_run, local_analysis, dry_run_id: dry_run_id.clone(), + run_id: args.run_id, + route_overrides: args.route_overrides, }; + if should_fetch_remote_routes { + let api = api_client + .as_ref() + .context("LLM_BENCHMARK_UPLOAD_URL 
required when --model-source remote is used")?; + config.route_overrides = Some(api.fetch_model_routes()?); + } + let bench_root = find_bench_root(); // Upload task catalog before running benchmarks @@ -396,10 +484,10 @@ fn cmd_analyze(args: AnalyzeArgs) -> Result<()> { let provider = make_provider_from_env()?; let analysis_route = ModelRoute::new( - "gpt-4.1-mini", + "gpt-5.4-mini", xtask_llm_benchmark::llm::types::Vendor::OpenAi, - "gpt-4.1-mini", - Some("openai/gpt-4.1-mini"), + "gpt-5.4-mini", + Some("openai/gpt-5.4-mini"), ); for ((lang, mode, model), group_failures) in &groups { @@ -534,6 +622,15 @@ fn short_hash(s: &str) -> &str { &s[..s.len().min(12)] } +fn should_fetch_remote_routes(args: &RunArgs) -> bool { + args.model_source == ModelSource::Remote + && args.models.is_none() + && args.route_overrides.is_none() + && !args.dry_run + && !args.hash_only + && !args.goldens_only +} + fn preflight_llm_routes( runtime: &Runtime, llm_provider: &dyn LlmProvider, @@ -651,7 +748,12 @@ fn run_mode_benchmarks( /// When explicit `openrouter:vendor/model` entries are passed they won't appear in /// `default_model_routes`, so we synthesize ad-hoc routes for them here. 
fn filter_routes(config: &RunConfig) -> Vec { - let mut routes: Vec = default_model_routes() + let base_routes: Vec = config + .route_overrides + .clone() + .unwrap_or_else(|| default_model_routes().to_vec()); + + let mut routes: Vec = base_routes .iter() .filter(|r| config.providers_filter.as_ref().is_none_or(|f| f.contains(&r.vendor))) .filter(|r| match &config.model_filter { @@ -710,11 +812,13 @@ async fn run_many_routes_for_mode( let dry_run = config.dry_run; let local_analysis = config.local_analysis; let dry_run_id = config.dry_run_id.clone(); + let run_id = config.run_id.clone(); futures::stream::iter(routes.iter().map(|route| { let host = host.clone(); let api_client = api_client.clone(); let dry_run_id = dry_run_id.clone(); + let run_id = run_id.clone(); async move { println!("\u{2192} running {}", route.display_name); @@ -733,6 +837,7 @@ async fn run_many_routes_for_mode( dry_run, local_analysis, dry_run_id, + run_id, }; let outcomes = run_selected_or_all_for_model_async_for_lang(&per).await?; @@ -806,8 +911,8 @@ fn find_bench_root() -> PathBuf { start.join("src").join("benchmarks") } -fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet) -> Result> { - let mut nums = HashSet::new(); +fn collect_task_names_in_categories(bench_root: &Path, cats: &HashSet) -> Result> { + let mut tasks = HashSet::new(); for c in cats { let dir = bench_root.join(c); if !dir.is_dir() { @@ -818,24 +923,38 @@ fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet) if !entry.file_type()?.is_dir() { continue; } - let name = entry.file_name().to_string_lossy().into_owned(); - if let Some(rest) = name.strip_prefix("t_") - && let Some((num_str, _)) = rest.split_once('_') - && num_str.len() == 3 - && let Ok(n) = num_str.parse::() - { - nums.insert(n); - } + tasks.insert(entry.file_name().to_string_lossy().to_ascii_lowercase()); } } - Ok(nums) + Ok(tasks) } -fn normalize_numeric_selectors(raw: &[String]) -> Vec { - raw.iter() - .filter(|s| 
!s.is_empty() && s.chars().all(|c| c.is_ascii_digit())) - .filter_map(|s| s.parse::().ok()) - .collect() +fn task_selector_matches_any(selector: &str, allowed_tasks: &HashSet) -> bool { + allowed_tasks.iter().any(|task| task.starts_with(selector)) +} + +fn normalize_task_filter_selector(raw: &str) -> Result { + let s = raw.trim().to_ascii_lowercase(); + if s.is_empty() { + anyhow::bail!("empty task selector"); + } + if let Some(rest) = s.strip_prefix("t_") { + if rest.chars().all(|c| c.is_ascii_digit()) { + let n: u32 = rest.parse()?; + return Ok(format!("t_{:03}", n)); + } + if rest.chars().next().is_some_and(|c| c.is_ascii_digit()) + && rest.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') + { + return Ok(s); + } + anyhow::bail!("invalid task selector: {raw}"); + } + if s.chars().all(|c| c.is_ascii_digit()) { + let n: u32 = s.parse()?; + return Ok(format!("t_{:03}", n)); + } + anyhow::bail!("invalid task selector: {raw}") } fn apply_category_filter( @@ -849,23 +968,126 @@ fn apply_category_filter( Ok(selectors.map(|s| s.to_vec())) } Some(cats) => { - let allowed = collect_task_numbers_in_categories(bench_root, cats)?; - let out_nums: Vec = match selectors { + let allowed = collect_task_names_in_categories(bench_root, cats)?; + let mut out: Vec = match selectors { Some(user) => { - let nums = normalize_numeric_selectors(user); - nums.into_iter().filter(|n| allowed.contains(n)).collect() + let mut selected = Vec::new(); + for selector in user { + let normalized = normalize_task_filter_selector(selector)?; + if task_selector_matches_any(&normalized, &allowed) { + selected.push(normalized); + } + } + selected } None => { - let mut v: Vec = allowed.into_iter().collect(); + let mut v: Vec = allowed.into_iter().collect(); v.sort_unstable(); v } }; - if out_nums.is_empty() { - Ok(None) - } else { - Ok(Some(out_nums.into_iter().map(|n| n.to_string()).collect())) + out.sort(); + out.dedup(); + if out.is_empty() { + anyhow::bail!("no tasks matched category/task 
filters"); } + Ok(Some(out)) } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn base_run_args() -> RunArgs { + RunArgs { + modes: None, + lang: Lang::Rust, + hash_only: false, + goldens_only: false, + force: false, + categories: None, + tasks: None, + providers: None, + models: None, + model_source: ModelSource::Static, + dry_run: false, + local_analysis: false, + route_overrides: None, + run_id: None, + } + } + + fn base_config(route_overrides: Option>) -> RunConfig { + RunConfig { + modes: None, + hash_only: false, + goldens_only: false, + lang: Lang::Rust, + providers_filter: None, + selectors: None, + force: false, + categories: None, + model_filter: None, + host: None, + api_client: None, + dry_run: false, + local_analysis: false, + dry_run_id: None, + run_id: None, + route_overrides, + } + } + + #[test] + fn remote_model_source_fetches_only_for_implicit_models() { + let mut args = base_run_args(); + args.model_source = ModelSource::Remote; + assert!(should_fetch_remote_routes(&args)); + + args.models = Some(vec![ModelGroup { + vendor: Vendor::OpenAi, + models: vec!["gpt-test".to_string()], + }]); + assert!(!should_fetch_remote_routes(&args)); + } + + #[test] + fn filter_routes_uses_remote_route_override() { + let remote_route = ModelRoute::new( + "Remote Model", + Vendor::OpenRouter, + "openai/remote-model", + Some("openai/remote-model"), + ); + let config = base_config(Some(vec![remote_route])); + + let routes = filter_routes(&config); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "Remote Model"); + assert_eq!(routes[0].api_model, "openai/remote-model"); + } + + #[test] + fn category_filter_accepts_full_task_ids() { + let root = std::env::temp_dir().join(format!( + "llm-benchmark-test-{}", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(root.join("basics").join("t_001_basic_tables")).unwrap(); + 
fs::create_dir_all(root.join("schema").join("t_012_product_type")).unwrap(); + + let mut categories = HashSet::new(); + categories.insert("basics".to_string()); + let selectors = vec!["t_001_basic_tables".to_string(), "t_012_product_type".to_string()]; + + let filtered = apply_category_filter(&root, Some(&categories), Some(&selectors)).unwrap(); + fs::remove_dir_all(&root).unwrap(); + + assert_eq!(filtered, Some(vec!["t_001_basic_tables".to_string()])); + } +} diff --git a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs index c7a057c4638..8bb0d1ac734 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs @@ -237,6 +237,12 @@ fn anthropic_max_output_tokens() -> u32 { pub fn normalize_anthropic_model(id: &str) -> &str { let lid = id.to_ascii_lowercase().replace('_', "-"); match lid.as_str() { + // Opus 4.8 + "opus-4.8" | "claude-opus-4.8" | "claude-opus-4-8" => "claude-opus-4-8", + + // Sonnet 4.6 + "sonnet-4.6" | "claude-sonnet-4.6" | "claude-sonnet-4-6" => "claude-sonnet-4-6", + // Sonnet 4.5 "sonnet-4.5" | "claude-sonnet-4.5" | "claude-sonnet-4-5" => "claude-sonnet-4-5", "claude-sonnet-4-5-20250929" => "claude-sonnet-4-5-20250929", diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs index d35ce789d17..54e0532db34 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs @@ -275,26 +275,44 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize { // Anthropic if m.contains("claude") { + if m.contains("4.6") + || m.contains("4-6") + || m.contains("4.7") + || m.contains("4-7") + || m.contains("4.8") + || m.contains("4-8") + { + return 1_000_000; + } return 185_000; } // OpenAI + if m.contains("gpt-5.5") { + return 1_050_000; + } if m.contains("gpt-5") || m.contains("gpt-4.1") { 
return 400_000; } if m.contains("gpt-4o") || m.contains("gpt-4") { return 128_000; } - // xAI / Grok — leave ~50 k headroom for segments + output on top of trimmed prefix - if m.contains("grok-code-fast") { + // xAI / Grok + if m.contains("grok-build-0.1") || m.contains("grok-code-fast") { return 200_000; } + if m.contains("grok-4.3") { + return 1_000_000; + } if m.contains("grok-4") { return 200_000; } if m.contains("grok") { return 90_000; } - // DeepSeek — hard cap is 131 072 on OpenRouter; leave ~25 k headroom for segments + output + // DeepSeek + if m.contains("deepseek-v4") { + return 1_000_000; + } if m.contains("deepseek") { return 106_000; } diff --git a/tools/xtask-llm-benchmark/src/llm/model_routes.rs b/tools/xtask-llm-benchmark/src/llm/model_routes.rs index e136976adb6..7f7ae93b66c 100644 --- a/tools/xtask-llm-benchmark/src/llm/model_routes.rs +++ b/tools/xtask-llm-benchmark/src/llm/model_routes.rs @@ -13,16 +13,16 @@ pub struct ModelRoute { static DEFAULT_ROUTES: LazyLock> = LazyLock::new(|| { use Vendor::*; vec![ - // OpenAI: Best GPT-5.2-Codex, Cheaper GPT-5-mini - ModelRoute::new("GPT-5.2-Codex", OpenAi, "gpt-5.2-codex", Some("openai/gpt-5.2-codex")), - ModelRoute::new("GPT-5-mini", OpenAi, "gpt-5-mini", Some("openai/gpt-5-mini")), - // Claude: Best Opus 4.6, Cheaper Sonnet 4.6 - // Direct API uses dashes (claude-opus-4-6); OpenRouter uses dots (claude-opus-4.6) + // OpenAI: Best GPT-5.5, Cheaper GPT-5.4-mini + ModelRoute::new("GPT-5.5", OpenAi, "gpt-5.5", Some("openai/gpt-5.5")), + ModelRoute::new("GPT-5.4-mini", OpenAi, "gpt-5.4-mini", Some("openai/gpt-5.4-mini")), + // Claude: Best Opus 4.8, Cheaper Sonnet 4.6 + // Direct API uses dashes (claude-opus-4-8); OpenRouter uses dots (claude-opus-4.8) ModelRoute::new( - "Claude Opus 4.6", + "Claude Opus 4.8", Anthropic, - "claude-opus-4-6", - Some("anthropic/claude-opus-4.6"), + "claude-opus-4-8", + Some("anthropic/claude-opus-4.8"), ), ModelRoute::new( "Claude Sonnet 4.6", @@ -30,9 +30,9 @@ static 
DEFAULT_ROUTES: LazyLock> = LazyLock::new(|| { "claude-sonnet-4-6", Some("anthropic/claude-sonnet-4.6"), ), - // Grok: Best Grok 4, Cheaper Grok Code - ModelRoute::new("Grok 4", Xai, "grok-4", Some("x-ai/grok-4.20-beta")), - ModelRoute::new("Grok Code", Xai, "grok-code-fast-1", Some("x-ai/grok-code-fast-1")), + // Grok: Best Grok 4.3, coding-specialized Grok Build + ModelRoute::new("Grok 4.3", Xai, "grok-4.3", Some("x-ai/grok-4.3")), + ModelRoute::new("Grok Build 0.1", Xai, "grok-build-0.1", Some("x-ai/grok-build-0.1")), // Gemini: direct via GOOGLE_API_KEY, falls back to OpenRouter if not set ModelRoute::new( "Gemini 3.1 Pro", @@ -41,24 +41,23 @@ static DEFAULT_ROUTES: LazyLock> = LazyLock::new(|| { Some("google/gemini-3.1-pro-preview"), ), ModelRoute::new( - "Gemini 3 Flash", + "Gemini 3.5 Flash", Google, - "gemini-3-flash-preview", - Some("google/gemini-3-flash-preview"), + "gemini-3.5-flash", + Some("google/gemini-3.5-flash"), ), - // DeepSeek: Reasoner (thinking), Chat (general) - // deepseek-reasoner is listed as deepseek-r1 on OpenRouter + // DeepSeek: Pro (highest capability), Flash (cheaper/faster) ModelRoute::new( - "DeepSeek Reasoner", + "DeepSeek V4 Pro", DeepSeek, - "deepseek-reasoner", - Some("deepseek/deepseek-r1"), + "deepseek-v4-pro", + Some("deepseek/deepseek-v4-pro"), ), ModelRoute::new( - "DeepSeek Chat", + "DeepSeek V4 Flash", DeepSeek, - "deepseek-chat", - Some("deepseek/deepseek-chat"), + "deepseek-v4-flash", + Some("deepseek/deepseek-v4-flash"), ), ] }); diff --git a/tools/xtask-llm-benchmark/src/llm/segmentation.rs b/tools/xtask-llm-benchmark/src/llm/segmentation.rs index 2926852ada0..26bc481e52f 100644 --- a/tools/xtask-llm-benchmark/src/llm/segmentation.rs +++ b/tools/xtask-llm-benchmark/src/llm/segmentation.rs @@ -88,14 +88,29 @@ pub fn build_anthropic_messages( } // Provider-specific context limits -pub fn anthropic_ctx_limit_tokens(_model: &str) -> usize { - // Anthropic hard limit is 200k; reserve ~15k for tokenizer variance + 
system/segments +pub fn anthropic_ctx_limit_tokens(model: &str) -> usize { + let m = model.to_ascii_lowercase(); + + // Newer Claude 4.6+ models expose a 1M context window. + if m.contains("4-6") + || m.contains("4.6") + || m.contains("4-7") + || m.contains("4.7") + || m.contains("4-8") + || m.contains("4.8") + { + return 1_000_000; + } + + // Older Anthropic models are 200k; reserve ~15k for tokenizer variance + system/segments. 185_000 } pub fn openai_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - if m.contains("gpt-5") || m.contains("gpt-4.1") { + if m.contains("gpt-5.5") { + 1_050_000 + } else if m.contains("gpt-5") || m.contains("gpt-4.1") { 400_000 } else { 128_000 @@ -105,7 +120,13 @@ pub fn openai_ctx_limit_tokens(model: &str) -> usize { pub fn deepseek_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - // API limit 128K for deepseek-chat and deepseek-reasoner + if m.starts_with("deepseek-v4") { + return 1_000_000; + } + if m.starts_with("deepseek-v3.2") { + return 128_000; + } + // API limit 128K for deepseek-chat and deepseek-reasoner compatibility aliases. 
if m.starts_with("deepseek-reasoner") || m.starts_with("deepseek-r1") { return 128_000; } @@ -123,8 +144,8 @@ pub fn deepseek_ctx_limit_tokens(model: &str) -> usize { pub fn gemini_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - // Gemini 2.5 series (very large) - if m.contains("2.5") && (m.contains("pro") || m.contains("flash")) { + // Gemini 3.x and 2.5 series (very large) + if (m.contains("3.") || m.contains("2.5")) && (m.contains("pro") || m.contains("flash")) { return 1_000_000; } @@ -160,9 +181,12 @@ pub fn meta_ctx_limit_tokens(model: &str) -> usize { pub fn xai_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - if m.contains("grok-code-fast-1") { + if m.contains("grok-build-0.1") || m.contains("grok-code-fast-1") { return 256_000; } + if m.contains("grok-4.3") { + return 1_000_000; + } if m.contains("grok-4") || m.contains("grok-3") { return 128_000; } From bcdb41de96cebae1eb721b5a148e074c251fac40 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 09:13:52 -0400 Subject: [PATCH 04/25] preflight credit checks; workflow update to use web --- .github/workflows/llm-benchmark-periodic.yml | 36 ++- tools/xtask-llm-benchmark/src/api/client.rs | 265 ++++-------------- tools/xtask-llm-benchmark/src/bench/runner.rs | 30 +- tools/xtask-llm-benchmark/src/bench/types.rs | 3 - .../src/bin/llm_benchmark.rs | 98 ++----- .../src/llm/clients/mod.rs | 4 +- .../src/llm/clients/openrouter.rs | 120 +++++++- 7 files changed, 239 insertions(+), 317 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index c0d9cc93d8c..183ba1c0ea9 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -6,10 +6,6 @@ on: - cron: '0 0 * * 1' workflow_dispatch: inputs: - run_id: - description: 'Website-created benchmark run id. 
When set, run spec is fetched from the website.' - required: false - default: '' models: description: 'Models to run (provider:model format, comma-separated, or "all")' required: false @@ -22,6 +18,14 @@ on: description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)' required: false default: 'guidelines,no_context' + categories: + description: 'Optional benchmark categories to run (comma-separated)' + required: false + default: '' + tasks: + description: 'Optional benchmark task ids/selectors to run (comma-separated)' + required: false + default: '' permissions: contents: read @@ -92,33 +96,39 @@ jobs: MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" LLM_BENCH_CSHARP_CONCURRENCY: "1" - INPUT_RUN_ID: ${{ inputs.run_id || '' }} INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }} INPUT_MODELS: ${{ inputs.models || 'all' }} INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }} + INPUT_CATEGORIES: ${{ inputs.categories || '' }} + INPUT_TASKS: ${{ inputs.tasks || '' }} run: | - RUN_ID="$INPUT_RUN_ID" LANGS="$INPUT_LANGUAGES" MODELS="$INPUT_MODELS" MODES="$INPUT_MODES" - - if [ -n "$RUN_ID" ]; then - llm_benchmark run-from-api --run-id "$RUN_ID" - exit $? 
- fi + CATEGORIES="$INPUT_CATEGORIES" + TASKS="$INPUT_TASKS" SUCCEEDED=0 FAILED=0 for LANG in $(echo "$LANGS" | tr ',' ' '); do + EXTRA_ARGS=() + if [ -n "$CATEGORIES" ]; then + EXTRA_ARGS+=(--categories "$CATEGORIES") + fi + if [ -n "$TASKS" ]; then + EXTRA_ARGS+=(--tasks "$TASKS") + fi + if [ "$MODELS" = "all" ]; then - if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG" FAILED=$((FAILED + 1)) fi else - if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "$MODELS"; then + read -r -a MODEL_ARGS <<< "$MODELS" + if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS" diff --git a/tools/xtask-llm-benchmark/src/api/client.rs b/tools/xtask-llm-benchmark/src/api/client.rs index 13fde0f8e81..0b43ccb5bac 100644 --- a/tools/xtask-llm-benchmark/src/api/client.rs +++ b/tools/xtask-llm-benchmark/src/api/client.rs @@ -1,53 +1,45 @@ use anyhow::{anyhow, Context, Result}; -use serde::Deserialize; use serde_json::json; -use std::str::FromStr; use crate::bench::normalize::{canonical_mode, normalize_model_names}; use crate::bench::types::{Results, RunOutcome}; -use crate::eval::Lang; use crate::llm::types::Vendor; use crate::llm::ModelRoute; -#[derive(Debug, Clone)] -pub struct RemoteRunSpec { - pub run_id: String, - pub languages: Vec, - pub modes: Vec, - pub routes: Vec, - pub categories: Option>, - pub tasks: Option>, -} - -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct RemoteModelRouteRow { - #[serde(alias = "displayName", alias = "name")] display_name: String, vendor: String, - #[serde(alias = "apiModel")] api_model: String, - #[serde(default, alias = "openrouterModel")] 
openrouter_model: Option, - #[serde(default)] active: Option, - #[serde(default)] available: Option, } -#[derive(Debug, Deserialize)] -struct RawRunSpec { - #[serde(default)] - id: Option, - #[serde(default, alias = "runId")] - run_id: Option, - languages: Vec, - modes: Vec, - #[serde(default, alias = "routes")] - models: Vec, - #[serde(default)] - categories: Option>, - #[serde(default)] - tasks: Option>, +fn read_string_field(row: &serde_json::Map, keys: &[&str]) -> Option { + keys.iter() + .find_map(|key| row.get(*key).and_then(|value| value.as_str())) + .map(str::to_string) +} + +fn read_bool_field(row: &serde_json::Map, keys: &[&str]) -> Option { + keys.iter() + .find_map(|key| row.get(*key).and_then(|value| value.as_bool())) +} + +fn parse_model_route_value(value: serde_json::Value) -> Result { + let row = value + .as_object() + .ok_or_else(|| anyhow!("remote model row must be an object"))?; + + Ok(RemoteModelRouteRow { + display_name: read_string_field(row, &["display_name", "displayName", "name"]).unwrap_or_default(), + vendor: read_string_field(row, &["vendor"]).unwrap_or_default(), + api_model: read_string_field(row, &["api_model", "apiModel"]).unwrap_or_default(), + openrouter_model: read_string_field(row, &["openrouter_model", "openrouterModel"]), + active: read_bool_field(row, &["active"]), + available: read_bool_field(row, &["available"]), + }) } fn parse_model_route_row(row: RemoteModelRouteRow) -> Result> { @@ -76,11 +68,12 @@ fn parse_model_route_row(row: RemoteModelRouteRow) -> Result> pub fn parse_model_routes_response(body: &serde_json::Value) -> Result> { let models = body.get("models").unwrap_or(body); - let rows: Vec = + let rows: Vec = serde_json::from_value(models.clone()).context("parse llm benchmark model rows")?; let mut routes = Vec::new(); - for row in rows { + for row in rows.into_iter().map(parse_model_route_value) { + let row = row?; if let Some(route) = parse_model_route_row(row)? 
{ routes.push(route); } @@ -93,83 +86,33 @@ pub fn parse_model_routes_response(body: &serde_json::Value) -> Result Result { - let spec = body.get("spec").or_else(|| body.get("spec_json")).unwrap_or(body); - let spec = match spec.as_str() { - Some(s) => serde_json::from_str::(s).context("parse run spec_json string")?, - None => spec.clone(), - }; - - let raw: RawRunSpec = serde_json::from_value(spec).context("parse llm benchmark run spec")?; - let run_id = raw.run_id.or(raw.id).unwrap_or_else(|| fallback_run_id.to_string()); - - let languages = raw - .languages - .iter() - .map(|lang| Lang::from_str(lang).map_err(|e| anyhow!(e))) - .collect::>>()?; - if languages.is_empty() { - anyhow::bail!("run spec '{}' has no languages", run_id); - } - - let modes: Vec = raw - .modes - .into_iter() - .map(|mode| mode.trim().to_string()) - .filter(|mode| !mode.is_empty()) - .collect(); - if modes.is_empty() { - anyhow::bail!("run spec '{}' has no modes", run_id); - } - - let mut routes = Vec::new(); - for row in raw.models { - if let Some(route) = parse_model_route_row(row)? { - routes.push(route); - } - } - if routes.is_empty() { - anyhow::bail!("run spec '{}' has no active available models", run_id); - } - - Ok(RemoteRunSpec { - run_id, - languages, - modes, - routes, - categories: raw.categories, - tasks: raw.tasks, - }) -} - /// HTTP client for the SpacetimeDB LLM benchmark API (spacetime-web Postgres). 
/// /// Supports endpoints owned by spacetime-web: /// - `POST /api/llm-benchmark-upload` - upload benchmark results /// - `POST /api/llm-benchmark-tasks` - upload task catalog /// - `GET /api/llm-benchmark-models?active=true` - fetch active benchmark models -/// - `GET /api/llm-benchmark-runs/{run_id}` - fetch admin-triggered run specs -/// - `PATCH /api/llm-benchmark-runs/{run_id}` - update admin-triggered run status #[derive(Clone)] pub struct ApiClient { - client: reqwest::blocking::Client, base_url: String, api_key: String, } impl ApiClient { pub fn new(base_url: &str, api_key: &str) -> Result { - let client = reqwest::blocking::Client::builder() - .timeout(std::time::Duration::from_secs(120)) - .build() - .context("failed to build HTTP client")?; Ok(Self { - client, base_url: base_url.trim_end_matches('/').to_string(), api_key: api_key.to_string(), }) } + fn client(&self) -> Result { + reqwest::blocking::Client::builder() + .timeout(std::time::Duration::from_secs(120)) + .build() + .context("failed to build HTTP client") + } + /// Build from environment variables `LLM_BENCHMARK_UPLOAD_URL` and `LLM_BENCHMARK_API_KEY`. /// Returns `None` if `LLM_BENCHMARK_UPLOAD_URL` is not set. pub fn from_env() -> Result> { @@ -185,13 +128,7 @@ impl ApiClient { /// Upload a batch of run outcomes for a single (lang, mode) combination. /// Normalizes model names and sanitizes volatile fields before upload. /// If `analysis` is provided, it is stored in the `llm_benchmark_analysis` table. 
- pub fn upload_batch( - &self, - mode: &str, - outcomes: &[RunOutcome], - analysis: Option<&str>, - run_id: Option<&str>, - ) -> Result { + pub fn upload_batch(&self, mode: &str, outcomes: &[RunOutcome], analysis: Option<&str>) -> Result { if outcomes.is_empty() { return Ok(0); } @@ -218,6 +155,7 @@ impl ApiClient { normalize_model_names(&mut results); let url = format!("{}/api/llm-benchmark-upload", self.base_url); + let client = self.client()?; let mut total_uploaded = 0usize; for lang_entry in &results.languages { @@ -232,18 +170,14 @@ impl ApiClient { } } - let mut payload = json!({ + let payload = json!({ "lang": lang_entry.lang, "mode": mode_entry.mode, "hash": mode_entry.hash, "models": models_json, }); - if let Some(run_id) = run_id { - payload["run_id"] = json!(run_id); - } - let resp = self - .client + let resp = client .post(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .header("Content-Type", "application/json") @@ -280,7 +214,7 @@ impl ApiClient { pub fn fetch_model_routes(&self) -> Result> { let url = format!("{}/api/llm-benchmark-models?active=true", self.base_url); let resp = self - .client + .client()? .get(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .send() @@ -296,80 +230,6 @@ impl ApiClient { } } - /// Fetch an immutable website-created run spec for admin-triggered runs. 
- pub fn fetch_run_spec(&self, run_id: &str) -> Result { - let run_id_path = urlencoding::encode(run_id); - let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path); - let resp = self - .client - .get(&url) - .header("Authorization", format!("Bearer {}", self.api_key)) - .send() - .with_context(|| format!("fetch LLM benchmark run spec failed for {run_id}"))?; - - if resp.status().is_success() { - let body: serde_json::Value = resp.json().context("parse run spec response")?; - parse_run_spec_response(&body, run_id) - } else { - let status = resp.status(); - let body = resp.text().unwrap_or_default(); - anyhow::bail!( - "fetch LLM benchmark run spec failed for {}: {} - {}", - run_id, - status, - body - ); - } - } - - /// Update website-created benchmark run status. - pub fn update_run_status(&self, run_id: &str, status: &str, error: Option<&str>) -> Result<()> { - let run_id_path = urlencoding::encode(run_id); - let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path); - let mut payload = json!({ - "status": status, - }); - if let Some(error) = error { - payload["error"] = json!(error); - } - if let Ok(github_run_id) = std::env::var("GITHUB_RUN_ID") - && !github_run_id.is_empty() - { - payload["github_run_id"] = json!(github_run_id); - if let (Ok(server_url), Ok(repo)) = (std::env::var("GITHUB_SERVER_URL"), std::env::var("GITHUB_REPOSITORY")) - { - payload["github_run_url"] = json!(format!( - "{}/{}/actions/runs/{}", - server_url.trim_end_matches('/'), - repo, - payload["github_run_id"].as_str().unwrap_or_default() - )); - } - } - - let resp = self - .client - .patch(&url) - .header("Authorization", format!("Bearer {}", self.api_key)) - .header("Content-Type", "application/json") - .json(&payload) - .send() - .with_context(|| format!("update LLM benchmark run status failed for {run_id}"))?; - - if resp.status().is_success() { - Ok(()) - } else { - let status_code = resp.status(); - let body = resp.text().unwrap_or_default(); 
- anyhow::bail!( - "update LLM benchmark run status failed for {}: {} - {}", - run_id, - status_code, - body - ); - } - } - /// Upload the task catalog to `POST /api/llm-benchmark-tasks`, derived from /// the benchmarks directory structure on disk. pub fn upload_task_catalog(&self, bench_root: &std::path::Path) -> Result { @@ -451,7 +311,7 @@ impl ApiClient { let payload = json!({ "categories": categories }); let resp = self - .client + .client()? .post(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .header("Content-Type", "application/json") @@ -483,7 +343,7 @@ impl ApiClient { let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&")); let resp = self - .client + .client()? .get(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .send() @@ -526,7 +386,7 @@ impl ApiClient { let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&")); let resp = self - .client + .client()? .get(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .send() @@ -560,7 +420,7 @@ impl ApiClient { let url = format!("{}/api/llm-benchmark-upload", self.base_url); let resp = self - .client + .client()? 
.post(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .header("Content-Type", "application/json") @@ -621,29 +481,24 @@ mod tests { } #[test] - fn parses_run_spec_response() { + fn parses_snake_case_model_route_fields() { let body = json!({ - "spec_json": { - "languages": ["rust", "typescript"], - "modes": ["guidelines", "no_context"], - "categories": ["basics"], - "tasks": ["t_001_basic_tables"], - "models": [{ - "display_name": "Claude Test", - "vendor": "anthropic", - "api_model": "claude-test", - "openrouter_model": "anthropic/claude-test" - }] - } + "models": [ + { + "display_name": "GPT Test", + "vendor": "openai", + "api_model": "gpt-test", + "openrouter_model": "openai/gpt-test", + "active": true, + "available": true + } + ] }); - let spec = parse_run_spec_response(&body, "run-123").unwrap(); - assert_eq!(spec.run_id, "run-123"); - assert_eq!(spec.languages, vec![Lang::Rust, Lang::TypeScript]); - assert_eq!(spec.modes, vec!["guidelines", "no_context"]); - assert_eq!(spec.categories.as_deref(), Some(&["basics".to_string()][..])); - assert_eq!(spec.tasks.as_deref(), Some(&["t_001_basic_tables".to_string()][..])); - assert_eq!(spec.routes.len(), 1); - assert_eq!(spec.routes[0].vendor, Vendor::Anthropic); + let routes = parse_model_routes_response(&body).unwrap(); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "GPT Test"); + assert_eq!(routes[0].api_model, "gpt-test"); + assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test")); } } diff --git a/tools/xtask-llm-benchmark/src/bench/runner.rs b/tools/xtask-llm-benchmark/src/bench/runner.rs index 92acc429b70..2536b5e5fe1 100644 --- a/tools/xtask-llm-benchmark/src/bench/runner.rs +++ b/tools/xtask-llm-benchmark/src/bench/runner.rs @@ -473,6 +473,23 @@ async fn maybe_generate_analysis(cfg: &BenchRunContext<'_>, outcomes: &[RunOutco Ok(analysis) } +async fn upload_batch_for_context( + cfg: &BenchRunContext<'_>, + outcomes: &[RunOutcome], + analysis: 
Option<&str>, +) -> Result<()> { + if let Some(api) = cfg.api_client.clone() { + let mode = cfg.mode.to_string(); + let outcomes = outcomes.to_vec(); + let analysis = analysis.map(str::to_string); + tokio::task::spawn_blocking(move || api.upload_batch(&mode, &outcomes, analysis.as_deref())).await??; + } else { + eprintln!("[runner] no API client configured; skipping upload"); + } + + Ok(()) +} + pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Result> { let total_wall = Instant::now(); @@ -632,11 +649,7 @@ pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Resu None } }; - if let Some(ref api) = cfg.api_client { - api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?; - } else { - eprintln!("[runner] no API client configured; skipping upload"); - } + upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?; } else { eprintln!("[runner] no results; skipping upload"); } @@ -831,11 +844,7 @@ pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> None } }; - if let Some(ref api) = cfg.api_client { - api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?; - } else { - eprintln!("[runner] no API client configured; skipping upload"); - } + upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?; } println!( @@ -865,7 +874,6 @@ pub async fn run_selected_or_all_for_model_async_for_lang(ctx: &BenchRunContext< dry_run: ctx.dry_run, local_analysis: ctx.local_analysis, dry_run_id: ctx.dry_run_id.clone(), - run_id: ctx.run_id.clone(), }; return run_selected_for_model_async_for_lang(&sel_cfg).await; } diff --git a/tools/xtask-llm-benchmark/src/bench/types.rs b/tools/xtask-llm-benchmark/src/bench/types.rs index 57bbed9a149..e54df0d4902 100644 --- a/tools/xtask-llm-benchmark/src/bench/types.rs +++ b/tools/xtask-llm-benchmark/src/bench/types.rs @@ -188,7 +188,6 @@ pub struct BenchRunContext<'a> { pub dry_run: bool, pub 
local_analysis: bool, pub dry_run_id: Option, - pub run_id: Option, } pub struct RunConfig { @@ -210,8 +209,6 @@ pub struct RunConfig { pub local_analysis: bool, /// Shared identifier used to group dry-run artifacts pub dry_run_id: Option, - /// Website-created run identifier for uploaded results - pub run_id: Option, /// Website-provided route list used instead of static default_model_routes() pub route_overrides: Option>, } diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index 219a770b502..0d6d1f7374f 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -82,9 +82,6 @@ enum Commands { /// Run benchmarks / build goldens / compute hashes. Run(RunArgs), - /// Run a website-created benchmark spec by id. - RunFromApi(RunFromApiArgs), - /// Run AI analysis on existing benchmark failures from the database. Analyze(AnalyzeArgs), } @@ -147,16 +144,6 @@ struct RunArgs { #[arg(skip)] route_overrides: Option>, - - #[arg(skip)] - run_id: Option, -} - -#[derive(Args, Debug, Clone)] -struct RunFromApiArgs { - /// Website-created llm_benchmark_runs id - #[arg(long)] - run_id: String, } #[derive(Args, Debug, Clone)] @@ -228,7 +215,6 @@ fn main() -> Result<()> { match cli.command { Commands::Run(args) => cmd_run(args), - Commands::RunFromApi(args) => cmd_run_from_api(args), Commands::Analyze(args) => cmd_analyze(args), } } @@ -240,57 +226,6 @@ fn cmd_run(args: RunArgs) -> Result<()> { Ok(()) } -fn cmd_run_from_api(args: RunFromApiArgs) -> Result<()> { - let api = ApiClient::from_env() - .context("failed to initialize API client")? 
- .context("LLM_BENCHMARK_UPLOAD_URL required for run-from-api")?; - if let Err(e) = api.update_run_status(&args.run_id, "running", None) { - eprintln!("[warn] failed to mark website benchmark run as running: {e:#}"); - } - - let result = cmd_run_from_api_inner(&api, &args.run_id); - match result { - Ok(()) => { - if let Err(e) = api.update_run_status(&args.run_id, "succeeded", None) { - eprintln!("[warn] failed to mark website benchmark run as succeeded: {e:#}"); - } - Ok(()) - } - Err(e) => { - let message = format!("{e:#}"); - if let Err(status_err) = api.update_run_status(&args.run_id, "failed", Some(&message)) { - eprintln!("[warn] failed to mark website benchmark run as failed: {status_err:#}"); - } - Err(e) - } - } -} - -fn cmd_run_from_api_inner(api: &ApiClient, run_id: &str) -> Result<()> { - let spec = api.fetch_run_spec(run_id)?; - - for lang in &spec.languages { - run_benchmarks(RunArgs { - modes: Some(spec.modes.clone()), - lang: *lang, - hash_only: false, - goldens_only: false, - force: false, - categories: spec.categories.clone(), - tasks: spec.tasks.clone(), - providers: None, - models: None, - model_source: ModelSource::Static, - dry_run: false, - local_analysis: false, - route_overrides: Some(spec.routes.clone()), - run_id: Some(spec.run_id.clone()), - })?; - } - - Ok(()) -} - /// Core benchmark runner used by both `run` and `ci-quickfix` fn run_benchmarks(args: RunArgs) -> Result<()> { let dry_run = args.dry_run; @@ -323,7 +258,6 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { dry_run, local_analysis, dry_run_id: dry_run_id.clone(), - run_id: args.run_id, route_overrides: args.route_overrides, }; @@ -624,7 +558,6 @@ fn short_hash(s: &str) -> &str { fn should_fetch_remote_routes(args: &RunArgs) -> bool { args.model_source == ModelSource::Remote - && args.models.is_none() && args.route_overrides.is_none() && !args.dry_run && !args.hash_only @@ -782,6 +715,7 @@ fn filter_routes(config: &RunConfig) -> Vec { let already_matched = 
routes.iter().any(|r| { r.vendor == *vendor && (r.api_model == model_id.as_str() + || r.display_name.to_ascii_lowercase() == model_id.as_str() || r.openrouter_model.as_deref() == Some(model_id.as_str())) }); if !already_matched { @@ -812,13 +746,11 @@ async fn run_many_routes_for_mode( let dry_run = config.dry_run; let local_analysis = config.local_analysis; let dry_run_id = config.dry_run_id.clone(); - let run_id = config.run_id.clone(); futures::stream::iter(routes.iter().map(|route| { let host = host.clone(); let api_client = api_client.clone(); let dry_run_id = dry_run_id.clone(); - let run_id = run_id.clone(); async move { println!("\u{2192} running {}", route.display_name); @@ -837,7 +769,6 @@ async fn run_many_routes_for_mode( dry_run, local_analysis, dry_run_id, - run_id, }; let outcomes = run_selected_or_all_for_model_async_for_lang(&per).await?; @@ -1015,7 +946,6 @@ mod tests { dry_run: false, local_analysis: false, route_overrides: None, - run_id: None, } } @@ -1035,13 +965,12 @@ mod tests { dry_run: false, local_analysis: false, dry_run_id: None, - run_id: None, route_overrides, } } #[test] - fn remote_model_source_fetches_only_for_implicit_models() { + fn remote_model_source_fetches_even_for_explicit_models() { let mut args = base_run_args(); args.model_source = ModelSource::Remote; assert!(should_fetch_remote_routes(&args)); @@ -1050,7 +979,7 @@ mod tests { vendor: Vendor::OpenAi, models: vec!["gpt-test".to_string()], }]); - assert!(!should_fetch_remote_routes(&args)); + assert!(should_fetch_remote_routes(&args)); } #[test] @@ -1069,6 +998,27 @@ mod tests { assert_eq!(routes[0].api_model, "openai/remote-model"); } + #[test] + fn filter_routes_does_not_synthesize_duplicate_for_display_name_match() { + let remote_route = ModelRoute::new( + "DeepSeek V4 Flash", + Vendor::DeepSeek, + "deepseek-v4-flash", + Some("deepseek/deepseek-v4-flash"), + ); + let mut config = base_config(Some(vec![remote_route])); + let mut allowed = HashSet::new(); + 
allowed.insert("deepseek v4 flash".to_string()); + let mut filter = HashMap::new(); + filter.insert(Vendor::DeepSeek, allowed); + config.model_filter = Some(filter); + + let routes = filter_routes(&config); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "DeepSeek V4 Flash"); + assert_eq!(routes[0].api_model, "deepseek-v4-flash"); + } + #[test] fn category_filter_accepts_full_task_ids() { let root = std::env::temp_dir().join(format!( diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs index d8eba39c4d0..83454c2677c 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs @@ -82,8 +82,8 @@ impl LlmClient for OpenRouterClient { "OpenRouter" } - async fn preflight(&self, _model: &str) -> Result { - let status = self.preflight_credits().await?; + async fn preflight(&self, model: &str) -> Result { + let status = self.preflight_credits(model).await?; Ok(ClientPreflight::new(status.summary())) } diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs index 54e0532db34..61d6998728c 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs @@ -36,9 +36,11 @@ impl OpenRouterClient { Self { base, api_key, http } } - pub async fn preflight_credits(&self) -> Result { + pub async fn preflight_credits(&self, model: &str) -> Result { let key_info = self.fetch_key_info().await?; let min_credits = min_credits_threshold(); + let mut unchecked_allowed = false; + let mut model_probe = None; if let Some(remaining) = key_info.limit_remaining && remaining <= min_credits @@ -69,10 +71,12 @@ impl OpenRouterClient { } if account.is_none() && key_info.limit_remaining.is_none() { - bail!( - "OpenRouter API key has no configured credit limit and account credits were not checked. 
\ - Set OPENROUTER_MANAGEMENT_API_KEY for account balance preflight." - ); + if allow_unchecked_credits() { + unchecked_allowed = true; + } else { + self.probe_model(model).await?; + model_probe = Some(model.to_string()); + } } Ok(OpenRouterCreditStatus { @@ -80,6 +84,8 @@ impl OpenRouterClient { key_limit_remaining: key_info.limit_remaining, account_remaining: account.map(|a| a.remaining), min_credits, + model_probe, + unchecked_allowed, }) } @@ -111,6 +117,51 @@ impl OpenRouterClient { }) } + async fn probe_model(&self, model: &str) -> Result<()> { + let url = format!("{}/chat/completions", self.base.trim_end_matches('/')); + + #[derive(Serialize)] + struct Req<'a> { + model: &'a str, + messages: [Msg<'a>; 1], + temperature: f32, + max_tokens: u32, + } + + #[derive(Serialize)] + struct Msg<'a> { + role: &'a str, + content: &'a str, + } + + let req = Req { + model, + messages: [Msg { + role: "user", + content: "ping", + }], + temperature: 0.0, + max_tokens: 1, + }; + let auth = HttpClient::bearer(&self.api_key); + let body = self + .http + .post_json(&url, &[auth], &req) + .await + .with_context(|| format!("OpenRouter model probe failed for '{model}'"))?; + + let resp: serde_json::Value = serde_json::from_str(&body).context("parse OpenRouter probe response")?; + if let Some(err) = resp.get("error") { + let message = err + .get("message") + .and_then(|message| message.as_str()) + .unwrap_or("unknown OpenRouter probe error"); + bail!("OpenRouter model probe failed for '{}': {}", model, message); + } + + Ok(()) + } + pub async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { let url = format!("{}/chat/completions", self.base.trim_end_matches('/')); @@ -207,6 +258,8 @@ pub struct OpenRouterCreditStatus { pub key_limit_remaining: Option, pub account_remaining: Option, pub min_credits: f64, + pub model_probe: Option, + pub unchecked_allowed: bool, } impl OpenRouterCreditStatus { @@ -218,7 +271,7 @@ impl OpenRouterCreditStatus { (None, None) => "key has 
no configured limit".to_string(), }; - match self.account_remaining { + let credit_status = match self.account_remaining { Some(remaining) => { format!( "{key_remaining}; account remaining {remaining:.4}; min {:.4}", @@ -229,6 +282,14 @@ impl OpenRouterCreditStatus { "{key_remaining}; account balance not checked (set OPENROUTER_MANAGEMENT_API_KEY); min {:.4}", self.min_credits ), + }; + + if let Some(model) = &self.model_probe { + format!("{credit_status}; model probe OK for '{model}'") + } else if self.unchecked_allowed { + format!("{credit_status}; unchecked credits allowed by OPENROUTER_ALLOW_UNCHECKED_CREDITS") + } else { + credit_status } } } @@ -261,12 +322,30 @@ struct OpenRouterAccountCredits { } fn min_credits_threshold() -> f64 { - env::var("LLM_MIN_CREDITS") - .ok() - .and_then(|v| v.trim().parse::().ok()) + let openrouter = env::var("OPENROUTER_MIN_CREDITS").ok(); + let global = env::var("LLM_MIN_CREDITS").ok(); + parse_min_credits_threshold(openrouter.as_deref(), global.as_deref()) +} + +fn allow_unchecked_credits() -> bool { + let value = env::var("OPENROUTER_ALLOW_UNCHECKED_CREDITS").ok(); + parse_env_flag(value.as_deref()) +} + +fn parse_min_credits_threshold(openrouter: Option<&str>, global: Option<&str>) -> f64 { + [openrouter, global] + .into_iter() + .flatten() + .find_map(|v| v.trim().parse::().ok()) .unwrap_or(0.0) } +fn parse_env_flag(value: Option<&str>) -> bool { + value + .map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "y")) + .unwrap_or(false) +} + /// Context limits for models accessed via OpenRouter. /// Uses the same limits as direct clients where known, /// falls back to a conservative default. 
@@ -333,3 +412,26 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize { DEFAULT_CTX_LIMIT } + +#[cfg(test)] +mod tests { + use super::{parse_env_flag, parse_min_credits_threshold}; + + #[test] + fn openrouter_min_credits_overrides_global_threshold() { + assert_eq!(parse_min_credits_threshold(Some("2.5"), Some("1.0")), 2.5); + assert_eq!(parse_min_credits_threshold(None, Some("1.0")), 1.0); + assert_eq!(parse_min_credits_threshold(Some("not-a-number"), Some("1.0")), 1.0); + assert_eq!(parse_min_credits_threshold(None, None), 0.0); + } + + #[test] + fn unchecked_credit_escape_hatch_accepts_common_true_values() { + for value in ["1", "true", "TRUE", " yes ", "y"] { + assert!(parse_env_flag(Some(value))); + } + for value in [None, Some(""), Some("0"), Some("false"), Some("no")] { + assert!(!parse_env_flag(value)); + } + } +} From f2179a25897a102da705371e720e7a78e90da1bf Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 10:25:20 -0400 Subject: [PATCH 05/25] weekly goldens; workflow refinements --- .github/workflows/llm-benchmark-periodic.yml | 20 ++++++++++----- .../llm-benchmark-validate-goldens.yml | 24 ++++++++++++------ .../src/bin/llm_benchmark.rs | 25 ++++++++++--------- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 183ba1c0ea9..cf4b57976f8 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -7,7 +7,7 @@ on: workflow_dispatch: inputs: models: - description: 'Models to run (provider:model format, comma-separated, or "all")' + description: 'Models to run ("all", or space-separated provider:model groups; each group may contain comma-separated models)' required: false default: 'all' languages: @@ -26,12 +26,16 @@ on: description: 'Optional benchmark task ids/selectors to run (comma-separated)' required: false default: '' + 
dry_run: + description: 'Run benchmarks without uploading results' + required: false + default: 'false' permissions: contents: read concurrency: - group: llm-benchmark-periodic + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true jobs: @@ -40,10 +44,9 @@ jobs: timeout-minutes: 180 steps: - - name: Checkout master + - name: Checkout repository uses: actions/checkout@v4 with: - ref: master fetch-depth: 1 - uses: dtolnay/rust-toolchain@stable @@ -101,12 +104,14 @@ jobs: INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }} INPUT_CATEGORIES: ${{ inputs.categories || '' }} INPUT_TASKS: ${{ inputs.tasks || '' }} + INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }} run: | LANGS="$INPUT_LANGUAGES" MODELS="$INPUT_MODELS" MODES="$INPUT_MODES" CATEGORIES="$INPUT_CATEGORIES" TASKS="$INPUT_TASKS" + DRY_RUN="$INPUT_DRY_RUN" SUCCEEDED=0 FAILED=0 @@ -118,6 +123,9 @@ jobs: if [ -n "$TASKS" ]; then EXTRA_ARGS+=(--tasks "$TASKS") fi + if [ "$DRY_RUN" = "true" ]; then + EXTRA_ARGS+=(--dry-run) + fi if [ "$MODELS" = "all" ]; then if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then @@ -137,7 +145,7 @@ jobs: fi done echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed" - if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then - echo "::error::All benchmark runs failed" + if [ "$FAILED" -gt 0 ]; then + echo "::error::$FAILED benchmark run(s) failed" exit 1 fi diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index 591d55a6a59..cb4c532833e 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -2,15 +2,26 @@ name: Validate LLM benchmark golden answers on: schedule: - # Nightly at 2 AM UTC - - cron: '0 2 * * *' - workflow_dispatch: {} + # Weekly on Monday at 2 AM UTC. 
+ - cron: '0 2 * * 1' + workflow_dispatch: + inputs: + lang: + description: 'Language to validate for manual smoke runs' + required: false + type: choice + default: all + options: + - all + - rust + - csharp + - typescript permissions: contents: read concurrency: - group: llm-benchmark-validate-goldens + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true jobs: @@ -21,13 +32,12 @@ jobs: strategy: fail-fast: false matrix: - lang: [rust, csharp, typescript] + lang: ${{ fromJSON(github.event_name == 'workflow_dispatch' && inputs.lang != 'all' && format('["{0}"]', inputs.lang) || '["rust","csharp","typescript"]') }} steps: - - name: Checkout master + - name: Checkout repository uses: actions/checkout@v4 with: - ref: master fetch-depth: 1 - uses: dtolnay/rust-toolchain@stable diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index 0d6d1f7374f..72b551a9540 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -233,13 +233,15 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { let dry_run_id = dry_run.then(|| chrono::Utc::now().format("%Y-%m-%d_%H%M%S").to_string()); let should_fetch_remote_routes = should_fetch_remote_routes(&args); - let api_client = if dry_run { - None - } else { + let needs_api_client = should_fetch_remote_routes || !dry_run; + let api_client = if needs_api_client { ApiClient::from_env().context("failed to initialize API client")? 
+ } else { + None }; + let upload_client = if dry_run { None } else { api_client.clone() }; - if api_client.is_none() && !dry_run { + if upload_client.is_none() && !dry_run { eprintln!("[warn] LLM_BENCHMARK_UPLOAD_URL not set; results will not be uploaded"); } @@ -254,7 +256,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { categories: categories_to_set(args.categories), model_filter: model_filter_from_groups(args.models), host: None, - api_client: api_client.clone(), + api_client: upload_client.clone(), dry_run, local_analysis, dry_run_id: dry_run_id.clone(), @@ -271,7 +273,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { let bench_root = find_bench_root(); // Upload task catalog before running benchmarks - if let Some(ref api) = api_client + if let Some(ref api) = upload_client && let Err(e) = api.upload_task_catalog(&bench_root) { eprintln!("[warn] failed to upload task catalog: {e}"); @@ -557,11 +559,7 @@ fn short_hash(s: &str) -> &str { } fn should_fetch_remote_routes(args: &RunArgs) -> bool { - args.model_source == ModelSource::Remote - && args.route_overrides.is_none() - && !args.dry_run - && !args.hash_only - && !args.goldens_only + args.model_source == ModelSource::Remote && args.route_overrides.is_none() && !args.hash_only && !args.goldens_only } fn preflight_llm_routes( @@ -970,7 +968,7 @@ mod tests { } #[test] - fn remote_model_source_fetches_even_for_explicit_models() { + fn remote_model_source_fetches_for_all_model_selection_paths() { let mut args = base_run_args(); args.model_source = ModelSource::Remote; assert!(should_fetch_remote_routes(&args)); @@ -980,6 +978,9 @@ mod tests { models: vec!["gpt-test".to_string()], }]); assert!(should_fetch_remote_routes(&args)); + + args.dry_run = true; + assert!(should_fetch_remote_routes(&args)); } #[test] From 8d1d27ec4c9e20e678b02eda4a6f7f25f68e0484 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:13:30 -0400 Subject: [PATCH 06/25] 
Update publishers.rs --- .../src/bench/publishers.rs | 251 +++++++++++++----- 1 file changed, 190 insertions(+), 61 deletions(-) diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 55b8a98d5b5..7ba383dbc06 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -31,6 +31,121 @@ fn pnpm_minimum_release_age() -> Result { .ok_or_else(|| anyhow::anyhow!("pnpm-workspace.yaml is missing minimumReleaseAge")) } +fn path_entries() -> Vec { + #[cfg(windows)] + let path = env::var_os("Path").or_else(|| env::var_os("PATH")); + #[cfg(not(windows))] + let path = env::var_os("PATH"); + + path.map(|path| env::split_paths(&path).collect()).unwrap_or_default() +} + +fn command_path_candidates(name: &str) -> Vec { + #[cfg(windows)] + { + let path = Path::new(name); + if path.extension().is_some() { + vec![name.to_string()] + } else { + vec![ + format!("{name}.cmd"), + format!("{name}.exe"), + format!("{name}.bat"), + name.to_string(), + ] + } + } + #[cfg(not(windows))] + { + vec![name.to_string()] + } +} + +fn resolve_command_on_path(name: &str) -> Option { + for dir in path_entries() { + for candidate in command_path_candidates(name) { + let path = dir.join(candidate); + if path.is_file() { + return Some(path); + } + } + } + None +} + +fn configured_nodejs_dir() -> Option { + env::var("NODEJS_DIR") + .ok() + .map(|s| s.trim().trim_matches('"').trim().to_string()) + .filter(|s| !s.is_empty()) + .map(PathBuf::from) +} + +fn pnpm_in_dir(dir: &Path) -> Option { + #[cfg(windows)] + { + for candidate in ["pnpm.cmd", "pnpm.exe", "pnpm.bat"] { + let path = dir.join(candidate); + if path.is_file() { + return Some(path); + } + } + None + } + #[cfg(not(windows))] + { + let path = dir.join("pnpm"); + path.is_file().then_some(path) + } +} + +fn node_in_dir(dir: &Path) -> Option { + #[cfg(windows)] + let path = dir.join("node.exe"); + #[cfg(not(windows))] + let 
path = dir.join("node"); + + path.is_file().then_some(path) +} + +fn resolve_node_exe(nodejs_dir: Option<&Path>) -> Option { + nodejs_dir + .and_then(node_in_dir) + .or_else(|| resolve_command_on_path("node")) + .or_else(|| { + env::var("NVM_SYMLINK") + .ok() + .map(PathBuf::from) + .and_then(|dir| node_in_dir(&dir)) + }) +} + +fn pnpm_cjs_for_cmd(pnpm: &Path) -> Option { + #[cfg(windows)] + { + let is_cmd = pnpm + .extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("cmd")); + if !is_cmd { + return None; + } + + let cjs = pnpm + .parent()? + .join("node_modules") + .join("pnpm") + .join("bin") + .join("pnpm.cjs"); + cjs.is_file().then_some(cjs) + } + #[cfg(not(windows))] + { + let _ = pnpm; + None + } +} + /// Strip ANSI escape codes (color codes) from a string fn strip_ansi_codes(s: &str) -> Cow<'_, str> { static ANSI_RE: LazyLock = LazyLock::new(|| { @@ -275,49 +390,31 @@ impl Publisher for TypeScriptPublisher { let db = sanitize_db_name(module_name); // Install dependencies (--ignore-workspace to avoid parent workspace interference). - // If NODEJS_DIR is set (e.g. nvm4w on Windows), use full path to pnpm so spawn finds it. 
- let pnpm_exe = env::var("NODEJS_DIR") - .ok() - .map(|s| s.trim().trim_matches('"').trim().to_string()) - .filter(|s| !s.is_empty()) - .map(PathBuf::from) - .and_then(|dir| { - #[cfg(windows)] - { - let pnpm_cmd = dir.join("pnpm.cmd"); - let pnpm_exe_path = dir.join("pnpm.exe"); - if pnpm_cmd.is_file() { - eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.cmd)", dir.display()); - Some(pnpm_cmd) - } else if pnpm_exe_path.is_file() { - eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.exe)", dir.display()); - Some(pnpm_exe_path) - } else { - eprintln!( - "[pnpm] NODEJS_DIR set to {} but pnpm.cmd/pnpm.exe not found there, using PATH", - dir.display() - ); - None - } - } - #[cfg(not(windows))] - { - let pnpm = dir.join("pnpm"); - if pnpm.is_file() { - eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm)", dir.display()); - Some(pnpm) - } else { - eprintln!( - "[pnpm] NODEJS_DIR set to {} but pnpm not found there, using PATH", - dir.display() - ); - None - } - } - }); - let mut pnpm_cmd = match &pnpm_exe { - Some(p) => Command::new(p), - None => Command::new("pnpm"), + let nodejs_dir = configured_nodejs_dir(); + let pnpm_exe = nodejs_dir + .as_deref() + .and_then(pnpm_in_dir) + .or_else(|| resolve_command_on_path("pnpm")); + if let Some(ref pnpm) = pnpm_exe { + eprintln!("[pnpm] using {}", pnpm.display()); + } else if let Some(ref dir) = nodejs_dir { + eprintln!( + "[pnpm] NODEJS_DIR set to {} but pnpm not found there or on PATH", + dir.display() + ); + } + let node_exe = resolve_node_exe(nodejs_dir.as_deref()); + let pnpm_cjs = pnpm_exe.as_deref().and_then(pnpm_cjs_for_cmd); + let mut pnpm_cmd = if let (Some(node), Some(cjs)) = (&node_exe, pnpm_cjs) { + eprintln!("[pnpm] invoking {} {}", node.display(), cjs.display()); + let mut cmd = Command::new(node); + cmd.arg(cjs); + cmd + } else { + match &pnpm_exe { + Some(p) => Command::new(p), + None => Command::new("pnpm"), + } }; pnpm_cmd .arg("install") @@ -327,30 +424,62 @@ impl Publisher for TypeScriptPublisher { // This install runs in 
a materialized project with workspace config // ignored, so pass the repo's pnpm package-age policy explicitly. .env("npm_config_minimum_release_age", pnpm_minimum_release_age()?); - // When using NODEJS_DIR, prepend it to PATH so pnpm.cmd can find node. - if let Some(ref dir) = pnpm_exe - && let Some(parent) = dir.parent() + let mut prepend_paths = Vec::new(); + if let Some(dir) = nodejs_dir { + prepend_paths.push(dir); + } + if let Some(ref pnpm) = pnpm_exe + && let Some(parent) = pnpm.parent() { - let mut paths: Vec = env::split_paths(&env::var("PATH").unwrap_or_default()).collect(); - paths.insert(0, parent.to_path_buf()); - if let Ok(new_path) = env::join_paths(paths) { - pnpm_cmd.env("PATH", new_path); + prepend_paths.push(parent.to_path_buf()); + } + if let Some(node) = node_exe + && let Some(parent) = node.parent() + { + prepend_paths.push(parent.to_path_buf()); + } + let child_path = if !prepend_paths.is_empty() { + let mut paths = path_entries(); + for path in prepend_paths.into_iter().rev() { + if !paths.iter().any(|existing| existing == &path) { + paths.insert(0, path); + } + } + env::join_paths(paths).ok() + } else { + None + }; + if let Some(ref new_path) = child_path { + #[cfg(windows)] + { + pnpm_cmd.env_remove("PATH"); + pnpm_cmd.env("Path", new_path); } + #[cfg(not(windows))] + pnpm_cmd.env("PATH", new_path); } run(&mut pnpm_cmd, "pnpm install (typescript)")?; // Publish (spacetime CLI handles TypeScript compilation internally) - run( - Command::new("spacetime") - .arg("publish") - .arg("-c") - .arg("-y") - .arg("--server") - .arg(host_url) - .arg(&db) - .current_dir(source), - "spacetime publish (typescript)", - )?; + let mut publish_cmd = Command::new("spacetime"); + publish_cmd + .arg("publish") + .arg("-c") + .arg("-y") + .arg("--server") + .arg(host_url) + .arg(&db) + .current_dir(source); + if let Some(ref new_path) = child_path { + #[cfg(windows)] + { + publish_cmd.env_remove("PATH"); + publish_cmd.env("Path", new_path); + } + 
#[cfg(not(windows))] + publish_cmd.env("PATH", new_path); + } + run(&mut publish_cmd, "spacetime publish (typescript)")?; Ok(()) } From d5957f26603d5c45eb874b991883b86af2d31d91 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 13:12:46 -0400 Subject: [PATCH 07/25] golden fixes --- .../queries/t_037_multi_column_filter/answers/typescript.ts | 2 +- .../benchmarks/schema/t_018_constraints/answers/typescript.ts | 2 +- .../schema/t_019_many_to_many/answers/typescript.ts | 4 ++-- .../schema/t_021_multi_column_index/answers/typescript.ts | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts index 26c7dc9b230..1ba8ca175d1 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts @@ -2,7 +2,7 @@ import { schema, table, t } from 'spacetimedb/server'; const eventLog = table({ name: 'event_log', - indexes: [{ name: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }], + indexes: [{ accessor: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }], }, { id: t.u64().primaryKey().autoInc(), category: t.string(), diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts index 50d9f9c1dae..d23dead5a96 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts @@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server'; const account = 
table({ name: 'account', - indexes: [{ name: 'byName', algorithm: 'btree', columns: ['name'] }], + indexes: [{ accessor: 'byName', algorithm: 'btree', columns: ['name'] }], }, { id: t.u64().primaryKey().autoInc(), email: t.string().unique(), diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts index d7629137dcc..4ab152504d1 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts @@ -24,8 +24,8 @@ const membership = table( { name: 'membership', indexes: [ - { name: 'byUser', algorithm: 'btree', columns: ['userId'] }, - { name: 'byGroup', algorithm: 'btree', columns: ['groupId'] }, + { accessor: 'byUser', algorithm: 'btree', columns: ['userId'] }, + { accessor: 'byGroup', algorithm: 'btree', columns: ['groupId'] }, ], }, { diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts index 5d5fb568d7b..2f237fb0151 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts @@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server'; const log = table({ name: 'log', - indexes: [{ name: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }], + indexes: [{ accessor: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }], }, { id: t.u64().primaryKey().autoInc(), userId: t.i32(), From 4c679e2ea9827e4fd3c702e7d7bd7c64b0111981 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:58:54 -0400 Subject: [PATCH 08/25] fixes --- 
.github/workflows/llm-benchmark-periodic.yml | 55 +++++++++++++++++-- .../src/bin/llm_benchmark.rs | 12 ++-- .../src/llm/clients/openrouter.rs | 2 +- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index cf4b57976f8..6673b541cde 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -6,10 +6,19 @@ on: - cron: '0 0 * * 1' workflow_dispatch: inputs: + model_set: + description: 'Model set to run' + required: false + type: choice + options: + - website_active + - local_defaults + - explicit + default: website_active models: - description: 'Models to run ("all", or space-separated provider:model groups; each group may contain comma-separated models)' + description: 'Space-separated provider:model groups. Required when model_set=explicit.' required: false - default: 'all' + default: '' languages: description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)' required: false @@ -100,19 +109,47 @@ jobs: DOTNET_CLI_USE_MSBUILD_SERVER: "0" LLM_BENCH_CSHARP_CONCURRENCY: "1" INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }} - INPUT_MODELS: ${{ inputs.models || 'all' }} + INPUT_MODEL_SET: ${{ inputs.model_set || 'website_active' }} + INPUT_MODELS: ${{ inputs.models || '' }} INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }} INPUT_CATEGORIES: ${{ inputs.categories || '' }} INPUT_TASKS: ${{ inputs.tasks || '' }} INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }} run: | LANGS="$INPUT_LANGUAGES" + MODEL_SET="$INPUT_MODEL_SET" MODELS="$INPUT_MODELS" MODES="$INPUT_MODES" CATEGORIES="$INPUT_CATEGORIES" TASKS="$INPUT_TASKS" DRY_RUN="$INPUT_DRY_RUN" + case "$MODEL_SET" in + website_active) + if [ -n "$MODELS" ]; then + echo "::error::models is only valid when model_set=explicit" + exit 1 + fi + ;; + local_defaults) + if [ -n "$MODELS" ]; then + echo "::error::models is only valid 
when model_set=explicit" + exit 1 + fi + ;; + explicit) + if [ -z "$MODELS" ]; then + echo "::error::models is required when model_set=explicit" + exit 1 + fi + read -r -a MODEL_ARGS <<< "$MODELS" + ;; + *) + echo "::error::unknown model_set '$MODEL_SET' (expected website_active, local_defaults, or explicit)" + exit 1 + ;; + esac + SUCCEEDED=0 FAILED=0 for LANG in $(echo "$LANGS" | tr ',' ' '); do @@ -127,16 +164,22 @@ jobs: EXTRA_ARGS+=(--dry-run) fi - if [ "$MODELS" = "all" ]; then + if [ "$MODEL_SET" = "website_active" ]; then if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG" FAILED=$((FAILED + 1)) fi + elif [ "$MODEL_SET" = "local_defaults" ]; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" "${EXTRA_ARGS[@]}"; then + SUCCEEDED=$((SUCCEEDED + 1)) + else + echo "::warning::Benchmark run failed for lang=$LANG" + FAILED=$((FAILED + 1)) + fi else - read -r -a MODEL_ARGS <<< "$MODELS" - if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS" diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index 72b551a9540..179be601634 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -559,7 +559,11 @@ fn short_hash(s: &str) -> &str { } fn should_fetch_remote_routes(args: &RunArgs) -> bool { - args.model_source == ModelSource::Remote && args.route_overrides.is_none() && !args.hash_only && !args.goldens_only + args.model_source == ModelSource::Remote + && args.models.is_none() + && args.route_overrides.is_none() + && 
!args.hash_only + && !args.goldens_only } fn preflight_llm_routes( @@ -968,7 +972,7 @@ mod tests { } #[test] - fn remote_model_source_fetches_for_all_model_selection_paths() { + fn explicit_models_bypass_remote_model_source() { let mut args = base_run_args(); args.model_source = ModelSource::Remote; assert!(should_fetch_remote_routes(&args)); @@ -977,10 +981,10 @@ mod tests { vendor: Vendor::OpenAi, models: vec!["gpt-test".to_string()], }]); - assert!(should_fetch_remote_routes(&args)); + assert!(!should_fetch_remote_routes(&args)); args.dry_run = true; - assert!(should_fetch_remote_routes(&args)); + assert!(!should_fetch_remote_routes(&args)); } #[test] diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs index 61d6998728c..8e8642ada0b 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs @@ -141,7 +141,7 @@ impl OpenRouterClient { content: "ping", }], temperature: 0.0, - max_tokens: 1, + max_tokens: 16, }; let auth = HttpClient::bearer(&self.api_key); let body = self From 4358ed59a83df45326c2593e8ae0bc0c235b7f0e Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:22:57 -0400 Subject: [PATCH 09/25] Update publishers.rs --- .../src/bench/publishers.rs | 134 ++++++++++++++---- 1 file changed, 109 insertions(+), 25 deletions(-) diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 7ba383dbc06..f82d617ae5f 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -6,7 +6,11 @@ use std::env; use std::fs; use std::path::{Path, PathBuf}; use std::process::Command; -use std::sync::LazyLock; +use std::sync::{ + atomic::{AtomicU64, Ordering}, + LazyLock, +}; +use std::time::{SystemTime, UNIX_EPOCH}; fn workspace_root() -> PathBuf { 
PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -120,6 +124,48 @@ fn resolve_node_exe(nodejs_dir: Option<&Path>) -> Option { }) } +struct CliRootDir { + path: PathBuf, +} + +impl CliRootDir { + fn path(&self) -> &Path { + &self.path + } +} + +impl Drop for CliRootDir { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.path); + } +} + +fn isolated_cli_root() -> Result { + static COUNTER: AtomicU64 = AtomicU64::new(0); + + for _ in 0..16 { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|duration| duration.as_nanos()) + .unwrap_or(0); + let id = COUNTER.fetch_add(1, Ordering::Relaxed); + let path = env::temp_dir().join(format!("stdb-llm-cli-{}-{nanos}-{id}", std::process::id())); + match fs::create_dir(&path) { + Ok(()) => return Ok(CliRootDir { path }), + Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => continue, + Err(error) => return Err(error.into()), + } + } + + bail!("failed to create isolated SpacetimeDB CLI root directory"); +} + +fn spacetime_cmd(cli_root: &CliRootDir) -> Command { + let mut cmd = Command::new("spacetime"); + cmd.arg("--root-dir").arg(cli_root.path()); + cmd +} + fn pnpm_cjs_for_cmd(pnpm: &Path) -> Option { #[cfg(windows)] { @@ -279,6 +325,36 @@ impl DotnetPublisher { } Ok(()) } + + fn configure_dotnet_env(cmd: &mut Command) -> &mut Command { + cmd.env("DOTNET_CLI_TELEMETRY_OPTOUT", "1") + .env("DOTNET_NOLOGO", "1") + // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors + // when running multiple dotnet builds in parallel. 
+ .env("MSBUILDDISABLENODEREUSE", "1") + .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") + } + + fn built_wasm(root: &Path, config_name: &str) -> Result { + let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|value| value == "1") { + "publish" + } else { + "AppBundle" + }; + let candidates = [ + root.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), + root.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), + ]; + + let mut found = candidates.iter().filter(|path| path.exists()); + let Some(path) = found.next() else { + bail!("dotnet publish succeeded but StdbModule.wasm was not found in bin or bin~"); + }; + if found.next().is_some() { + bail!("dotnet publish produced both bin and bin~ outputs; cannot choose the C# wasm"); + } + Ok(path.to_path_buf()) + } } impl Publisher for DotnetPublisher { @@ -288,30 +364,36 @@ impl Publisher for DotnetPublisher { } println!("publish csharp module {}", module_name); - Self::ensure_csproj(source)?; + let source = fs::canonicalize(source)?; + Self::ensure_csproj(&source)?; let db = sanitize_db_name(module_name); + let cli_root = isolated_cli_root()?; - let mut cmd = Command::new("spacetime"); - cmd.arg("build") - .current_dir(source) - .env("DOTNET_CLI_TELEMETRY_OPTOUT", "1") - .env("DOTNET_NOLOGO", "1") - // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors - // when running multiple dotnet builds in parallel. 
- .env("MSBUILDDISABLENODEREUSE", "1") - .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0"); - run(&mut cmd, "spacetime build (csharp)")?; - - let mut pubcmd = Command::new("spacetime"); + let config_name = "Release"; + let mut build_cmd = Command::new("dotnet"); + build_cmd + .arg("publish") + .arg("-c") + .arg(config_name) + .arg("-v") + .arg("quiet") + .current_dir(&source); + Self::configure_dotnet_env(&mut build_cmd); + run(&mut build_cmd, "dotnet publish (csharp)")?; + let wasm = Self::built_wasm(&source, config_name)?; + + let mut pubcmd = spacetime_cmd(&cli_root); pubcmd .arg("publish") .arg("-c") .arg("-y") .arg("--server") .arg(host_url) + .arg("--bin-path") + .arg(wasm) .arg(&db) - .current_dir(source); + .current_dir(&source); run(&mut pubcmd, "spacetime publish (csharp)")?; Ok(()) @@ -345,10 +427,11 @@ impl Publisher for SpacetimeRustPublisher { // sanitize db + server let db = sanitize_db_name(module_name); + let cli_root = isolated_cli_root()?; // 2) Publish run( - Command::new("spacetime") + spacetime_cmd(&cli_root) .arg("publish") .arg("-c") .arg("-y") @@ -388,6 +471,7 @@ impl Publisher for TypeScriptPublisher { Self::ensure_package_json(source)?; let db = sanitize_db_name(module_name); + let cli_root = isolated_cli_root()?; // Install dependencies (--ignore-workspace to avoid parent workspace interference). 
let nodejs_dir = configured_nodejs_dir(); @@ -428,15 +512,15 @@ impl Publisher for TypeScriptPublisher { if let Some(dir) = nodejs_dir { prepend_paths.push(dir); } - if let Some(ref pnpm) = pnpm_exe - && let Some(parent) = pnpm.parent() - { - prepend_paths.push(parent.to_path_buf()); + if let Some(ref pnpm) = pnpm_exe { + if let Some(parent) = pnpm.parent() { + prepend_paths.push(parent.to_path_buf()); + } } - if let Some(node) = node_exe - && let Some(parent) = node.parent() - { - prepend_paths.push(parent.to_path_buf()); + if let Some(node) = node_exe { + if let Some(parent) = node.parent() { + prepend_paths.push(parent.to_path_buf()); + } } let child_path = if !prepend_paths.is_empty() { let mut paths = path_entries(); @@ -461,7 +545,7 @@ impl Publisher for TypeScriptPublisher { run(&mut pnpm_cmd, "pnpm install (typescript)")?; // Publish (spacetime CLI handles TypeScript compilation internally) - let mut publish_cmd = Command::new("spacetime"); + let mut publish_cmd = spacetime_cmd(&cli_root); publish_cmd .arg("publish") .arg("-c") From 890be1885a21df20f0e4b6aef7b7a9ffd9c8146b Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:39:42 -0400 Subject: [PATCH 10/25] updates --- .github/workflows/llm-benchmark-periodic.yml | 3 +++ .github/workflows/llm-benchmark-validate-goldens.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 6673b541cde..290cfbee325 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -105,6 +105,9 @@ jobs: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }} LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }} + DOTNET_MULTILEVEL_LOOKUP: "0" + DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1" 
MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" LLM_BENCH_CSHARP_CONCURRENCY: "1" diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index cb4c532833e..a5199cb0bfe 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -87,6 +87,9 @@ jobs: - name: Validate golden answers (${{ matrix.lang }}) env: + DOTNET_MULTILEVEL_LOOKUP: "0" + DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1" MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" LLM_BENCH_CSHARP_CONCURRENCY: "1" From 480cedf0ba60839177cc2d17fd530f9d08b58354 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:44:08 -0400 Subject: [PATCH 11/25] Update publishers.rs --- .../src/bench/publishers.rs | 45 +++---------------- 1 file changed, 7 insertions(+), 38 deletions(-) diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index f82d617ae5f..18d07dfa42b 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -334,27 +334,6 @@ impl DotnetPublisher { .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") } - - fn built_wasm(root: &Path, config_name: &str) -> Result { - let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|value| value == "1") { - "publish" - } else { - "AppBundle" - }; - let candidates = [ - root.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), - root.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), - ]; - - let mut found = candidates.iter().filter(|path| path.exists()); - let Some(path) = found.next() else { - bail!("dotnet publish succeeded but StdbModule.wasm was not found in bin or bin~"); - }; - if 
found.next().is_some() { - bail!("dotnet publish produced both bin and bin~ outputs; cannot choose the C# wasm"); - } - Ok(path.to_path_buf()) - } } impl Publisher for DotnetPublisher { @@ -364,24 +343,15 @@ impl Publisher for DotnetPublisher { } println!("publish csharp module {}", module_name); - let source = fs::canonicalize(source)?; - Self::ensure_csproj(&source)?; + Self::ensure_csproj(source)?; let db = sanitize_db_name(module_name); let cli_root = isolated_cli_root()?; - let config_name = "Release"; - let mut build_cmd = Command::new("dotnet"); - build_cmd - .arg("publish") - .arg("-c") - .arg(config_name) - .arg("-v") - .arg("quiet") - .current_dir(&source); - Self::configure_dotnet_env(&mut build_cmd); - run(&mut build_cmd, "dotnet publish (csharp)")?; - let wasm = Self::built_wasm(&source, config_name)?; + let mut cmd = spacetime_cmd(&cli_root); + cmd.arg("build").current_dir(source); + Self::configure_dotnet_env(&mut cmd); + run(&mut cmd, "spacetime build (csharp)")?; let mut pubcmd = spacetime_cmd(&cli_root); pubcmd @@ -390,10 +360,9 @@ impl Publisher for DotnetPublisher { .arg("-y") .arg("--server") .arg(host_url) - .arg("--bin-path") - .arg(wasm) .arg(&db) - .current_dir(&source); + .current_dir(source); + Self::configure_dotnet_env(&mut pubcmd); run(&mut pubcmd, "spacetime publish (csharp)")?; Ok(()) From d4999e2e21bd3193ca1d579727d7986e8f50b52c Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:12:28 -0400 Subject: [PATCH 12/25] fixes --- .github/workflows/llm-benchmark-periodic.yml | 2 +- .../llm-benchmark-validate-goldens.yml | 2 +- .../src/bench/publishers.rs | 48 +++++++++++++++++-- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 290cfbee325..566e7db82d8 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ 
-64,7 +64,7 @@ jobs: - name: Setup .NET SDK uses: actions/setup-dotnet@v4 with: - dotnet-version: "8.0.x" + global-json-file: global.json - name: Install WASI workload env: diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index a5199cb0bfe..fedbb0c406c 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -47,7 +47,7 @@ jobs: if: matrix.lang == 'csharp' uses: actions/setup-dotnet@v4 with: - dotnet-version: "8.0.x" + global-json-file: global.json - name: Install WASI workload if: matrix.lang == 'csharp' diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 18d07dfa42b..0c67a0ffff7 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -1,5 +1,5 @@ use crate::bench::utils::sanitize_db_name; -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use regex::Regex; use std::borrow::Cow; use std::env; @@ -334,6 +334,41 @@ impl DotnetPublisher { .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") } + + fn built_wasm_path(project_path: &Path) -> Result { + let config_name = "Release"; + let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") { + "publish" + } else { + "AppBundle" + }; + let output_paths = [ + project_path.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), + project_path.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), + ]; + + let mut found = output_paths.iter().filter(|path| path.exists()).collect::>(); + if found.len() > 1 { + bail!( + "C# build produced multiple StdbModule.wasm outputs in {}", + project_path.display() + ); + } + + let Some(wasm_path) = found.pop() else { + bail!( + "C# build finished but no StdbModule.wasm was found under {}", + 
project_path.display() + ); + }; + + let optimized_path = wasm_path.with_extension("opt.wasm"); + if optimized_path.exists() { + Ok(optimized_path) + } else { + Ok(wasm_path.to_path_buf()) + } + } } impl Publisher for DotnetPublisher { @@ -346,13 +381,18 @@ impl Publisher for DotnetPublisher { Self::ensure_csproj(source)?; let db = sanitize_db_name(module_name); + let source = source + .canonicalize() + .with_context(|| format!("failed to resolve C# source path {}", source.display()))?; let cli_root = isolated_cli_root()?; let mut cmd = spacetime_cmd(&cli_root); - cmd.arg("build").current_dir(source); + cmd.arg("build").arg("--module-path").arg(&source).current_dir(&source); Self::configure_dotnet_env(&mut cmd); run(&mut cmd, "spacetime build (csharp)")?; + let wasm_path = Self::built_wasm_path(&source)?; + let mut pubcmd = spacetime_cmd(&cli_root); pubcmd .arg("publish") @@ -360,8 +400,10 @@ impl Publisher for DotnetPublisher { .arg("-y") .arg("--server") .arg(host_url) + .arg("--bin-path") + .arg(&wasm_path) .arg(&db) - .current_dir(source); + .current_dir(&source); Self::configure_dotnet_env(&mut pubcmd); run(&mut pubcmd, "spacetime publish (csharp)")?; From e58523f21f4f0828760a774324574e1fbe803fcf Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:41:29 -0400 Subject: [PATCH 13/25] Update publishers.rs --- tools/xtask-llm-benchmark/src/bench/publishers.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 0c67a0ffff7..f905a789601 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -386,10 +386,15 @@ impl Publisher for DotnetPublisher { .with_context(|| format!("failed to resolve C# source path {}", source.display()))?; let cli_root = isolated_cli_root()?; - let mut cmd = 
spacetime_cmd(&cli_root); - cmd.arg("build").arg("--module-path").arg(&source).current_dir(&source); + let mut cmd = Command::new("dotnet"); + cmd.arg("publish") + .arg("-c") + .arg("Release") + .arg("-v") + .arg("quiet") + .current_dir(&source); Self::configure_dotnet_env(&mut cmd); - run(&mut cmd, "spacetime build (csharp)")?; + run(&mut cmd, "dotnet publish (csharp)")?; let wasm_path = Self::built_wasm_path(&source)?; From 032afd1797195db76ec03dc990081f7dcafd336e Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:08:37 -0400 Subject: [PATCH 14/25] fixes --- .github/workflows/llm-benchmark-periodic.yml | 6 ++ .../llm-benchmark-validate-goldens.yml | 6 ++ .../src/bench/templates.rs | 93 +++++++++++++++++-- .../templates/csharp/server/StdbModule.csproj | 10 +- 4 files changed, 100 insertions(+), 15 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 566e7db82d8..bc456520724 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -74,6 +74,12 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel + - name: Pack C# runtime packages + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }} + run: | + dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime + dotnet pack -c Release crates/bindings-csharp/Runtime + - name: Set up Node.js uses: actions/setup-node@v4 with: diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index fedbb0c406c..a2d2ef87a3e 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -58,6 +58,12 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel + - name: 
Pack C# runtime packages + if: matrix.lang == 'csharp' + run: | + dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime + dotnet pack -c Release crates/bindings-csharp/Runtime + - name: Set up Node.js if: matrix.lang == 'typescript' uses: actions/setup-node@v4 diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs index b5fa5f6add3..8ebafc2aefe 100644 --- a/tools/xtask-llm-benchmark/src/bench/templates.rs +++ b/tools/xtask-llm-benchmark/src/bench/templates.rs @@ -159,20 +159,99 @@ fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> { } fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))?; - let base_rel = relative_to_workspace(root, "crates/bindings-csharp")?; let runtime_csproj = workspace_root().join("crates/bindings-csharp/Runtime/Runtime.csproj"); if !runtime_csproj.is_file() { bail!("local C# Runtime not found at {}", runtime_csproj.display()); } - let runtime_ref = format!("{}/Runtime/Runtime.csproj", base_rel); - let runtime_dir = format!("{}/Runtime", base_rel); - let codegen_ref = format!("{}/Codegen/Codegen.csproj", base_rel); + let runtime_version = read_csharp_package_version(&runtime_csproj)?; let csproj_path = root.join("StdbModule.csproj"); let mut csproj = fs::read_to_string(&csproj_path).with_context(|| format!("read {}", csproj_path.display()))?; - csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_DIR}", &runtime_dir); - csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_REF}", &runtime_ref); - csproj = csproj.replace("{SPACETIME_CSHARP_CODEGEN_REF}", &codegen_ref); + csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_VERSION}", &runtime_version); fs::write(&csproj_path, csproj).with_context(|| format!("write {}", csproj_path.display()))?; + + write_csharp_nuget_config(root)?; + Ok(()) +} + +fn read_csharp_package_version(csproj_path: &Path) -> Result { + let contents = fs::read_to_string(csproj_path).with_context(|| format!("read {}", 
csproj_path.display()))?; + let version = contents + .split("") + .nth(1) + .and_then(|rest| rest.split("").next()) + .map(str::trim) + .filter(|version| !version.is_empty()) + .with_context(|| format!("missing in {}", csproj_path.display()))?; + Ok(version.to_owned()) +} + +fn normalize_nuget_path(path: &Path) -> String { + path.display() + .to_string() + .replace('\\', "/") + .trim_end_matches('/') + .to_string() +} + +fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> { + let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg")) + }); + if !has_package { + bail!( + "local C# package {} not found in {}. Run: dotnet pack -c Release crates/bindings-csharp/{}", + package_id, + path.display(), + package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id) + ); + } + Ok(()) +} + +fn write_csharp_nuget_config(root: &Path) -> Result<()> { + let workspace = workspace_root(); + let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release"); + let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release"); + + ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?; + ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?; + + let package_cache = root.join(".nuget/packages"); + let nuget_config = format!( + r#" + + + + + + + + + + + + + + + + + + + + + + +"#, + normalize_nuget_path(&package_cache), + normalize_nuget_path(&runtime_source), + normalize_nuget_path(&bsatn_source), + ); + + fs::write(root.join("nuget.config"), nuget_config) + .with_context(|| format!("write {}", root.join("nuget.config").display()))?; Ok(()) } diff --git a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj index ce04141c7a0..f286932badd 100644 --- 
a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj +++ b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj @@ -1,9 +1,5 @@ - - - - net8.0 wasi-wasm @@ -12,9 +8,7 @@ - - - + - \ No newline at end of file + From 603741817b123aa41a54f90d4fac96f432e833c2 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:39:05 -0400 Subject: [PATCH 15/25] match smoketest (fingers crossed?) --- .github/workflows/llm-benchmark-periodic.yml | 6 - .../llm-benchmark-validate-goldens.yml | 6 - .../src/bench/publishers.rs | 70 ++--------- .../src/bench/templates.rs | 111 +++++++++++++----- tools/xtask-llm-benchmark/src/bench/utils.rs | 6 +- 5 files changed, 97 insertions(+), 102 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index bc456520724..566e7db82d8 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -74,12 +74,6 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel - - name: Pack C# runtime packages - if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }} - run: | - dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime - dotnet pack -c Release crates/bindings-csharp/Runtime - - name: Set up Node.js uses: actions/setup-node@v4 with: diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index a2d2ef87a3e..fedbb0c406c 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -58,12 +58,6 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel - - name: Pack C# runtime packages - if: matrix.lang == 'csharp' - run: | - dotnet pack -c Release 
crates/bindings-csharp/BSATN.Runtime - dotnet pack -c Release crates/bindings-csharp/Runtime - - name: Set up Node.js if: matrix.lang == 'typescript' uses: actions/setup-node@v4 diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index f905a789601..6109b872314 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -211,14 +211,14 @@ pub trait Publisher: Send + Sync { /// Check if the process was killed by a signal (e.g., SIGSEGV = 11) #[cfg(unix)] -fn was_signal_killed(status: &std::process::ExitStatus) -> bool { +fn signal_killed_by(status: &std::process::ExitStatus) -> Option { use std::os::unix::process::ExitStatusExt; - status.signal().is_some() + status.signal() } #[cfg(not(unix))] -fn was_signal_killed(_status: &std::process::ExitStatus) -> bool { - false +fn signal_killed_by(_status: &std::process::ExitStatus) -> Option { + None } /// Check if the failure is a transient error that should be retried. @@ -282,13 +282,14 @@ fn run_with_retry(cmd: &mut Command, label: &str, max_retries: u32) -> Result<() let stderr = strip_ansi_codes(&stderr_raw); let stdout = strip_ansi_codes(&stdout_raw); - // Retry on signal kills (like SIGSEGV) or transient build errors - let should_retry = was_signal_killed(&out.status) || is_transient_build_error(&stderr, &stdout); + // Retry on signal kills (like SIGSEGV) or transient build errors. 
+ let signal = signal_killed_by(&out.status); + let should_retry = signal.is_some() || is_transient_build_error(&stderr, &stdout); if should_retry && attempt < max_retries { - let reason = if was_signal_killed(&out.status) { - "signal kill" + let reason = if let Some(signal) = signal { + format!("signal {signal}") } else { - "transient build error" + "transient build error".to_string() }; eprintln!("⚠️ {label}: {reason} detected, will retry..."); last_error = Some(format!( @@ -334,41 +335,6 @@ impl DotnetPublisher { .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") } - - fn built_wasm_path(project_path: &Path) -> Result { - let config_name = "Release"; - let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") { - "publish" - } else { - "AppBundle" - }; - let output_paths = [ - project_path.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), - project_path.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")), - ]; - - let mut found = output_paths.iter().filter(|path| path.exists()).collect::>(); - if found.len() > 1 { - bail!( - "C# build produced multiple StdbModule.wasm outputs in {}", - project_path.display() - ); - } - - let Some(wasm_path) = found.pop() else { - bail!( - "C# build finished but no StdbModule.wasm was found under {}", - project_path.display() - ); - }; - - let optimized_path = wasm_path.with_extension("opt.wasm"); - if optimized_path.exists() { - Ok(optimized_path) - } else { - Ok(wasm_path.to_path_buf()) - } - } } impl Publisher for DotnetPublisher { @@ -386,18 +352,6 @@ impl Publisher for DotnetPublisher { .with_context(|| format!("failed to resolve C# source path {}", source.display()))?; let cli_root = isolated_cli_root()?; - let mut cmd = Command::new("dotnet"); - cmd.arg("publish") - .arg("-c") - .arg("Release") - .arg("-v") - .arg("quiet") - .current_dir(&source); - Self::configure_dotnet_env(&mut cmd); - run(&mut cmd, "dotnet publish 
(csharp)")?; - - let wasm_path = Self::built_wasm_path(&source)?; - let mut pubcmd = spacetime_cmd(&cli_root); pubcmd .arg("publish") @@ -405,8 +359,8 @@ impl Publisher for DotnetPublisher { .arg("-y") .arg("--server") .arg(host_url) - .arg("--bin-path") - .arg(&wasm_path) + .arg("--module-path") + .arg(&source) .arg(&db) .current_dir(&source); Self::configure_dotnet_env(&mut pubcmd); diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs index 8ebafc2aefe..e6d8938481f 100644 --- a/tools/xtask-llm-benchmark/src/bench/templates.rs +++ b/tools/xtask-llm-benchmark/src/bench/templates.rs @@ -3,8 +3,17 @@ use anyhow::{bail, Context, Result}; use std::{ env, fs, io, path::{Path, PathBuf}, + process::Command, + sync::OnceLock, }; +const CSHARP_PACKAGE_PROJECTS: [(&str, &str); 2] = [ + ("BSATN.Runtime", "SpacetimeDB.BSATN.Runtime"), + ("Runtime", "SpacetimeDB.Runtime"), +]; + +static CSHARP_LOCAL_FEED: OnceLock> = OnceLock::new(); + pub fn materialize_project( lang: &str, category: &str, @@ -193,33 +202,82 @@ fn normalize_nuget_path(path: &Path) -> String { .to_string() } -fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> { - let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| { - entry - .file_name() - .to_str() - .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg")) - }); - if !has_package { - bail!( - "local C# package {} not found in {}. 
Run: dotnet pack -c Release crates/bindings-csharp/{}", - package_id, - path.display(), - package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id) - ); +fn run_dotnet(mut cmd: Command, label: &str) -> Result<()> { + let debug = format!("{cmd:?}"); + let output = cmd + .output() + .with_context(|| format!("failed to run {label}: {debug}"))?; + if output.status.success() { + return Ok(()); } - Ok(()) + bail!( + "{label} failed: {debug}\n--- stderr ---\n{}\n--- stdout ---\n{}", + String::from_utf8_lossy(&output.stderr), + String::from_utf8_lossy(&output.stdout) + ); } -fn write_csharp_nuget_config(root: &Path) -> Result<()> { +fn csharp_local_feed() -> Result { + match CSHARP_LOCAL_FEED.get_or_init(|| build_csharp_local_feed().map_err(|err| format!("{err:#}"))) { + Ok(path) => Ok(path.clone()), + Err(err) => bail!("{err}"), + } +} + +fn build_csharp_local_feed() -> Result { let workspace = workspace_root(); - let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release"); - let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release"); + let bindings = workspace.join("crates/bindings-csharp"); + let local_feed = workspace.join("target/llm-benchmark-csharp/local-feed"); - ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?; - ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?; + if local_feed.exists() { + fs::remove_dir_all(&local_feed).with_context(|| format!("remove {}", local_feed.display()))?; + } + fs::create_dir_all(&local_feed).with_context(|| format!("create {}", local_feed.display()))?; + + for (project_dir, _) in CSHARP_PACKAGE_PROJECTS { + let mut cmd = Command::new("dotnet"); + cmd.arg("pack") + .arg("-c") + .arg("Release") + .arg("-o") + .arg(&local_feed) + .current_dir(bindings.join(project_dir)); + run_dotnet(cmd, &format!("dotnet pack {project_dir}"))?; + } + + let feed_files = fs::read_dir(&local_feed) + .with_context(|| format!("inspect {}", 
local_feed.display()))? + .flatten() + .filter_map(|entry| entry.file_name().into_string().ok()) + .collect::>(); + + for (_, package_id) in CSHARP_PACKAGE_PROJECTS { + let package_prefix = format!("{package_id}."); + if !feed_files + .iter() + .any(|name| name.starts_with(&package_prefix) && name.ends_with(".nupkg")) + { + bail!( + "local C# feed at {} is missing package {}. Found files: {:?}", + local_feed.display(), + package_id, + feed_files + ); + } + } + + Ok(local_feed) +} + +fn write_csharp_nuget_config(root: &Path) -> Result<()> { + let local_feed = csharp_local_feed()?; let package_cache = root.join(".nuget/packages"); + if package_cache.exists() { + fs::remove_dir_all(&package_cache).with_context(|| format!("remove {}", package_cache.display()))?; + } + fs::create_dir_all(&package_cache).with_context(|| format!("create {}", package_cache.display()))?; + let nuget_config = format!( r#" @@ -228,16 +286,12 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> { - - + - - - - - + + @@ -246,8 +300,7 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> { "#, normalize_nuget_path(&package_cache), - normalize_nuget_path(&runtime_source), - normalize_nuget_path(&bsatn_source), + normalize_nuget_path(&local_feed), ); fs::write(root.join("nuget.config"), nuget_config) diff --git a/tools/xtask-llm-benchmark/src/bench/utils.rs b/tools/xtask-llm-benchmark/src/bench/utils.rs index a8ccddc23e5..6e28315e4f6 100644 --- a/tools/xtask-llm-benchmark/src/bench/utils.rs +++ b/tools/xtask-llm-benchmark/src/bench/utils.rs @@ -109,13 +109,13 @@ pub fn bench_rust_concurrency() -> usize { .unwrap_or(2) } -/// Concurrency for C# builds. Lower default than Rust due to dotnet/WASI SDK -/// instability under high parallelism (causes SIGSEGV and "Pipe is broken" errors). +/// Concurrency for C# builds. Keep this serialized to match smoketest behavior; +/// dotnet/WASI SDK builds are fragile when multiple generated modules publish at once. 
pub fn bench_csharp_concurrency() -> usize { env::var("LLM_BENCH_CSHARP_CONCURRENCY") .ok() .and_then(|s| s.parse().ok()) - .unwrap_or(4) + .unwrap_or(1) } pub fn bench_route_concurrency() -> usize { From 2e6e02fe97bdfd7b23cb68d84fe007fc39189c37 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:55:18 -0400 Subject: [PATCH 16/25] fix --- .github/workflows/llm-benchmark-periodic.yml | 6 + .../llm-benchmark-validate-goldens.yml | 6 + .../src/bench/templates.rs | 106 +++++------------- 3 files changed, 41 insertions(+), 77 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 566e7db82d8..bc456520724 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -74,6 +74,12 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel + - name: Pack C# runtime packages + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }} + run: | + dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime + dotnet pack -c Release crates/bindings-csharp/Runtime + - name: Set up Node.js uses: actions/setup-node@v4 with: diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index fedbb0c406c..a2d2ef87a3e 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -58,6 +58,12 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel + - name: Pack C# runtime packages + if: matrix.lang == 'csharp' + run: | + dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime + dotnet pack -c Release crates/bindings-csharp/Runtime + - name: Set up Node.js if: matrix.lang == 'typescript' uses: actions/setup-node@v4 diff --git 
a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs index e6d8938481f..35176de8200 100644 --- a/tools/xtask-llm-benchmark/src/bench/templates.rs +++ b/tools/xtask-llm-benchmark/src/bench/templates.rs @@ -3,17 +3,8 @@ use anyhow::{bail, Context, Result}; use std::{ env, fs, io, path::{Path, PathBuf}, - process::Command, - sync::OnceLock, }; -const CSHARP_PACKAGE_PROJECTS: [(&str, &str); 2] = [ - ("BSATN.Runtime", "SpacetimeDB.BSATN.Runtime"), - ("Runtime", "SpacetimeDB.Runtime"), -]; - -static CSHARP_LOCAL_FEED: OnceLock> = OnceLock::new(); - pub fn materialize_project( lang: &str, category: &str, @@ -202,75 +193,31 @@ fn normalize_nuget_path(path: &Path) -> String { .to_string() } -fn run_dotnet(mut cmd: Command, label: &str) -> Result<()> { - let debug = format!("{cmd:?}"); - let output = cmd - .output() - .with_context(|| format!("failed to run {label}: {debug}"))?; - if output.status.success() { - return Ok(()); - } - bail!( - "{label} failed: {debug}\n--- stderr ---\n{}\n--- stdout ---\n{}", - String::from_utf8_lossy(&output.stderr), - String::from_utf8_lossy(&output.stdout) - ); -} - -fn csharp_local_feed() -> Result { - match CSHARP_LOCAL_FEED.get_or_init(|| build_csharp_local_feed().map_err(|err| format!("{err:#}"))) { - Ok(path) => Ok(path.clone()), - Err(err) => bail!("{err}"), +fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> { + let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg")) + }); + if !has_package { + bail!( + "local C# package {} not found in {}. 
Run: dotnet pack -c Release crates/bindings-csharp/{}", + package_id, + path.display(), + package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id) + ); } + Ok(()) } -fn build_csharp_local_feed() -> Result { +fn write_csharp_nuget_config(root: &Path) -> Result<()> { let workspace = workspace_root(); - let bindings = workspace.join("crates/bindings-csharp"); - let local_feed = workspace.join("target/llm-benchmark-csharp/local-feed"); + let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release"); + let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release"); - if local_feed.exists() { - fs::remove_dir_all(&local_feed).with_context(|| format!("remove {}", local_feed.display()))?; - } - fs::create_dir_all(&local_feed).with_context(|| format!("create {}", local_feed.display()))?; - - for (project_dir, _) in CSHARP_PACKAGE_PROJECTS { - let mut cmd = Command::new("dotnet"); - cmd.arg("pack") - .arg("-c") - .arg("Release") - .arg("-o") - .arg(&local_feed) - .current_dir(bindings.join(project_dir)); - run_dotnet(cmd, &format!("dotnet pack {project_dir}"))?; - } - - let feed_files = fs::read_dir(&local_feed) - .with_context(|| format!("inspect {}", local_feed.display()))? - .flatten() - .filter_map(|entry| entry.file_name().into_string().ok()) - .collect::>(); - - for (_, package_id) in CSHARP_PACKAGE_PROJECTS { - let package_prefix = format!("{package_id}."); - if !feed_files - .iter() - .any(|name| name.starts_with(&package_prefix) && name.ends_with(".nupkg")) - { - bail!( - "local C# feed at {} is missing package {}. 
Found files: {:?}", - local_feed.display(), - package_id, - feed_files - ); - } - } - - Ok(local_feed) -} - -fn write_csharp_nuget_config(root: &Path) -> Result<()> { - let local_feed = csharp_local_feed()?; + ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?; + ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?; let package_cache = root.join(".nuget/packages"); if package_cache.exists() { @@ -286,12 +233,16 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> { - + + - - + + + + + @@ -300,7 +251,8 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> { "#, normalize_nuget_path(&package_cache), - normalize_nuget_path(&local_feed), + normalize_nuget_path(&runtime_source), + normalize_nuget_path(&bsatn_source), ); fs::write(root.join("nuget.config"), nuget_config) From b2308b1a6c906d0155359d83ff32bb61ba339fa4 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:09:02 -0400 Subject: [PATCH 17/25] shrug --- crates/cli/src/tasks/csharp.rs | 87 ++++++++++--------- .../src/bench/publishers.rs | 3 + 2 files changed, 51 insertions(+), 39 deletions(-) diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs index 5df8b730448..7f76b3199d2 100644 --- a/crates/cli/src/tasks/csharp.rs +++ b/crates/cli/src/tasks/csharp.rs @@ -1,5 +1,6 @@ use anyhow::Context; use itertools::Itertools; +use std::env; use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; @@ -8,6 +9,12 @@ fn parse_major_version(version: &str) -> Option { version.split('.').next()?.parse::().ok() } +fn skip_workload_check() -> bool { + env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK") + .ok() + .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES")) +} + pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result { // All `dotnet` commands must execute in the project directory, otherwise // global.json won't 
have any effect and wrong .NET SDK might be picked. @@ -17,46 +24,48 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re }; } - // Check if the `wasi-experimental` workload is installed. Unfortunately, we - // have to do this by inspecting the human-readable output. There is a - // hidden `--machine-readable` flag but it also mixes in human-readable - // output as well as unnecessarily updates various unrelated manifests. - match dotnet!("workload", "list").read() { - Ok(workloads) if workloads.contains("wasi-experimental") => {} - Ok(_) => { - // If wasi-experimental is not found, first check if we're running - // on .NET SDK 8.0. We can't even install that workload on older - // versions, and we don't support .NET 9.0 yet, so this helps to - // provide a nicer message than "Workload ID wasi-experimental is not recognized.". - let version = dotnet!("--version").read().unwrap_or_default(); - if parse_major_version(&version) != Some(8) { - anyhow::bail!(concat!( - ".NET SDK 8.0 is required, but found {version}.\n", - "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." - )); - } + if !skip_workload_check() { + // Check if the `wasi-experimental` workload is installed. Unfortunately, we + // have to do this by inspecting the human-readable output. There is a + // hidden `--machine-readable` flag but it also mixes in human-readable + // output as well as unnecessarily updates various unrelated manifests. + match dotnet!("workload", "list").read() { + Ok(workloads) if workloads.contains("wasi-experimental") => {} + Ok(_) => { + // If wasi-experimental is not found, first check if we're running + // on .NET SDK 8.0. We can't even install that workload on older + // versions, and we don't support .NET 9.0 yet, so this helps to + // provide a nicer message than "Workload ID wasi-experimental is not recognized.". 
+ let version = dotnet!("--version").read().unwrap_or_default(); + if parse_major_version(&version) != Some(8) { + anyhow::bail!(concat!( + ".NET SDK 8.0 is required, but found {version}.\n", + "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." + )); + } - // Finally, try to install the workload ourselves. On some systems - // this might require elevated privileges, so print a nice error - // message if it fails. - dotnet!( - "workload", - "install", - "wasi-experimental", - "--skip-manifest-update" - ) - .stderr_capture() - .run() - .context(concat!( - "Couldn't install the required wasi-experimental workload.\n", - "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." - ))?; - } - Err(error) if error.kind() == std::io::ErrorKind::NotFound => { - anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.") - } - Err(error) => anyhow::bail!("{error}"), - }; + // Finally, try to install the workload ourselves. On some systems + // this might require elevated privileges, so print a nice error + // message if it fails. + dotnet!( + "workload", + "install", + "wasi-experimental", + "--skip-manifest-update" + ) + .stderr_capture() + .run() + .context(concat!( + "Couldn't install the required wasi-experimental workload.\n", + "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." + ))?; + } + Err(error) if error.kind() == std::io::ErrorKind::NotFound => { + anyhow::bail!("dotnet not found in PATH. 
Please install .NET SDK 8.0.") + } + Err(error) => anyhow::bail!("{error}"), + }; + } let config_name = if build_debug { "Debug" } else { "Release" }; diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 6109b872314..11fc75bbac5 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -334,6 +334,9 @@ impl DotnetPublisher { // when running multiple dotnet builds in parallel. .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") + // The workflow installs the WASI workload before running benchmarks. + // Avoid `dotnet workload list`, which can segfault on the CI runner. + .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1") } } From 2b133b8bc9630ecf7268c3ff8c950878cfc0246a Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:19:06 -0400 Subject: [PATCH 18/25] fix? --- crates/cli/src/tasks/csharp.rs | 29 ++++++++++++++++--- .../src/bench/publishers.rs | 2 ++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs index 7f76b3199d2..f9337cc42aa 100644 --- a/crates/cli/src/tasks/csharp.rs +++ b/crates/cli/src/tasks/csharp.rs @@ -9,12 +9,20 @@ fn parse_major_version(version: &str) -> Option { version.split('.').next()?.parse::().ok() } -fn skip_workload_check() -> bool { - env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK") +fn env_flag(name: &str) -> bool { + env::var(name) .ok() .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES")) } +fn skip_workload_check() -> bool { + env_flag("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK") +} + +fn stable_dotnet_publish() -> bool { + env_flag("SPACETIMEDB_CSHARP_STABLE_PUBLISH") +} + pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result { // All `dotnet` commands must execute in the 
project directory, otherwise // global.json won't have any effect and wrong .NET SDK might be picked. @@ -77,8 +85,21 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re ) })?; - // run dotnet publish using cmd macro - dotnet!("publish", "-c", config_name, "-v", "quiet").run()?; + let mut publish_args = vec!["publish", "-c", config_name, "-v"]; + if stable_dotnet_publish() { + publish_args.extend([ + "minimal", + "--disable-build-servers", + "-m:1", + "-p:BuildInParallel=false", + "-p:RestoreDisableParallel=true", + "-p:UseSharedCompilation=false", + ]); + } else { + publish_args.push("quiet"); + } + + duct::cmd("dotnet", publish_args).dir(project_path).run()?; // check if file exists let subdir = if std::env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") { diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 11fc75bbac5..700cb955ea0 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -337,6 +337,8 @@ impl DotnetPublisher { // The workflow installs the WASI workload before running benchmarks. // Avoid `dotnet workload list`, which can segfault on the CI runner. .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1") + // Keep benchmark C# publishes on the conservative MSBuild path. 
+ .env("SPACETIMEDB_CSHARP_STABLE_PUBLISH", "1") } } From 7857671200ad19e0f926925a5c7bd4e213c657e5 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:51:54 -0400 Subject: [PATCH 19/25] testing --- crates/cli/src/tasks/csharp.rs | 112 +++++++----------- .../src/bench/publishers.rs | 5 - .../src/bin/llm_benchmark.rs | 37 ++---- 3 files changed, 53 insertions(+), 101 deletions(-) diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs index f9337cc42aa..5df8b730448 100644 --- a/crates/cli/src/tasks/csharp.rs +++ b/crates/cli/src/tasks/csharp.rs @@ -1,6 +1,5 @@ use anyhow::Context; use itertools::Itertools; -use std::env; use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; @@ -9,20 +8,6 @@ fn parse_major_version(version: &str) -> Option { version.split('.').next()?.parse::().ok() } -fn env_flag(name: &str) -> bool { - env::var(name) - .ok() - .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES")) -} - -fn skip_workload_check() -> bool { - env_flag("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK") -} - -fn stable_dotnet_publish() -> bool { - env_flag("SPACETIMEDB_CSHARP_STABLE_PUBLISH") -} - pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result { // All `dotnet` commands must execute in the project directory, otherwise // global.json won't have any effect and wrong .NET SDK might be picked. @@ -32,48 +17,46 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re }; } - if !skip_workload_check() { - // Check if the `wasi-experimental` workload is installed. Unfortunately, we - // have to do this by inspecting the human-readable output. There is a - // hidden `--machine-readable` flag but it also mixes in human-readable - // output as well as unnecessarily updates various unrelated manifests. 
- match dotnet!("workload", "list").read() { - Ok(workloads) if workloads.contains("wasi-experimental") => {} - Ok(_) => { - // If wasi-experimental is not found, first check if we're running - // on .NET SDK 8.0. We can't even install that workload on older - // versions, and we don't support .NET 9.0 yet, so this helps to - // provide a nicer message than "Workload ID wasi-experimental is not recognized.". - let version = dotnet!("--version").read().unwrap_or_default(); - if parse_major_version(&version) != Some(8) { - anyhow::bail!(concat!( - ".NET SDK 8.0 is required, but found {version}.\n", - "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." - )); - } - - // Finally, try to install the workload ourselves. On some systems - // this might require elevated privileges, so print a nice error - // message if it fails. - dotnet!( - "workload", - "install", - "wasi-experimental", - "--skip-manifest-update" - ) - .stderr_capture() - .run() - .context(concat!( - "Couldn't install the required wasi-experimental workload.\n", - "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." - ))?; + // Check if the `wasi-experimental` workload is installed. Unfortunately, we + // have to do this by inspecting the human-readable output. There is a + // hidden `--machine-readable` flag but it also mixes in human-readable + // output as well as unnecessarily updates various unrelated manifests. + match dotnet!("workload", "list").read() { + Ok(workloads) if workloads.contains("wasi-experimental") => {} + Ok(_) => { + // If wasi-experimental is not found, first check if we're running + // on .NET SDK 8.0. We can't even install that workload on older + // versions, and we don't support .NET 9.0 yet, so this helps to + // provide a nicer message than "Workload ID wasi-experimental is not recognized.". 
+ let version = dotnet!("--version").read().unwrap_or_default(); + if parse_major_version(&version) != Some(8) { + anyhow::bail!(concat!( + ".NET SDK 8.0 is required, but found {version}.\n", + "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." + )); } - Err(error) if error.kind() == std::io::ErrorKind::NotFound => { - anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.") - } - Err(error) => anyhow::bail!("{error}"), - }; - } + + // Finally, try to install the workload ourselves. On some systems + // this might require elevated privileges, so print a nice error + // message if it fails. + dotnet!( + "workload", + "install", + "wasi-experimental", + "--skip-manifest-update" + ) + .stderr_capture() + .run() + .context(concat!( + "Couldn't install the required wasi-experimental workload.\n", + "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." + ))?; + } + Err(error) if error.kind() == std::io::ErrorKind::NotFound => { + anyhow::bail!("dotnet not found in PATH. 
Please install .NET SDK 8.0.") + } + Err(error) => anyhow::bail!("{error}"), + }; let config_name = if build_debug { "Debug" } else { "Release" }; @@ -85,21 +68,8 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re ) })?; - let mut publish_args = vec!["publish", "-c", config_name, "-v"]; - if stable_dotnet_publish() { - publish_args.extend([ - "minimal", - "--disable-build-servers", - "-m:1", - "-p:BuildInParallel=false", - "-p:RestoreDisableParallel=true", - "-p:UseSharedCompilation=false", - ]); - } else { - publish_args.push("quiet"); - } - - duct::cmd("dotnet", publish_args).dir(project_path).run()?; + // run dotnet publish using cmd macro + dotnet!("publish", "-c", config_name, "-v", "quiet").run()?; // check if file exists let subdir = if std::env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") { diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 700cb955ea0..6109b872314 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -334,11 +334,6 @@ impl DotnetPublisher { // when running multiple dotnet builds in parallel. .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") - // The workflow installs the WASI workload before running benchmarks. - // Avoid `dotnet workload list`, which can segfault on the CI runner. - .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1") - // Keep benchmark C# publishes on the conservative MSBuild path. 
- .env("SPACETIMEDB_CSHARP_STABLE_PUBLISH", "1") } } diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index 179be601634..6ec030a49e8 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -279,11 +279,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { eprintln!("[warn] failed to upload task catalog: {e}"); } - let RuntimeInit { - runtime, - provider: llm_provider, - guard, - } = initialize_runtime_and_provider(config.hash_only, config.goldens_only)?; + let RuntimeInit { runtime, guard } = initialize_runtime(config.hash_only)?; config.host = guard.as_ref().map(|g| g.host_url.clone()); @@ -309,12 +305,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { return Ok(()); } - if !config.goldens_only && !config.hash_only { - let rt = runtime.as_ref().expect("failed to initialize runtime for preflight"); - let provider = llm_provider.as_ref().expect("llm provider required for preflight"); - let routes = filter_routes(&config); - preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?; - + let llm_provider = if !config.goldens_only && !config.hash_only { let rt = runtime.as_ref().expect("failed to initialize runtime for goldens"); rt.block_on(ensure_goldens_built_once( config.host.clone(), @@ -322,7 +313,15 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { config.lang, selectors_ref, ))?; - } + + let provider = make_provider_from_env()?; + let rt = runtime.as_ref().expect("failed to initialize runtime for preflight"); + let routes = filter_routes(&config); + preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?; + Some(provider) + } else { + None + }; let mut all_outcomes: Vec = Vec::new(); @@ -799,15 +798,13 @@ fn categories_to_set(v: Option>) -> Option> { pub struct RuntimeInit { pub runtime: Option, - pub provider: Option>, pub guard: Option, } -fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) 
-> Result { +fn initialize_runtime(hash_only: bool) -> Result { if hash_only { return Ok(RuntimeInit { runtime: None, - provider: None, guard: None, }); } @@ -817,18 +814,8 @@ fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Resul let runtime = tokio::runtime::Builder::new_multi_thread().enable_all().build()?; - if goldens_only { - return Ok(RuntimeInit { - runtime: Some(runtime), - provider: None, - guard: Some(spacetime), - }); - } - - let llm_provider = make_provider_from_env()?; Ok(RuntimeInit { runtime: Some(runtime), - provider: Some(llm_provider), guard: Some(spacetime), }) } From ee38f7a9d3d8a352afb749b15ccd749ec58d12f0 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:02:23 -0400 Subject: [PATCH 20/25] test --- crates/cli/src/tasks/csharp.rs | 87 ++++++++++--------- .../src/bench/publishers.rs | 3 + 2 files changed, 51 insertions(+), 39 deletions(-) diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs index 5df8b730448..7f76b3199d2 100644 --- a/crates/cli/src/tasks/csharp.rs +++ b/crates/cli/src/tasks/csharp.rs @@ -1,5 +1,6 @@ use anyhow::Context; use itertools::Itertools; +use std::env; use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; @@ -8,6 +9,12 @@ fn parse_major_version(version: &str) -> Option { version.split('.').next()?.parse::().ok() } +fn skip_workload_check() -> bool { + env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK") + .ok() + .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES")) +} + pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result { // All `dotnet` commands must execute in the project directory, otherwise // global.json won't have any effect and wrong .NET SDK might be picked. 
@@ -17,46 +24,48 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re }; } - // Check if the `wasi-experimental` workload is installed. Unfortunately, we - // have to do this by inspecting the human-readable output. There is a - // hidden `--machine-readable` flag but it also mixes in human-readable - // output as well as unnecessarily updates various unrelated manifests. - match dotnet!("workload", "list").read() { - Ok(workloads) if workloads.contains("wasi-experimental") => {} - Ok(_) => { - // If wasi-experimental is not found, first check if we're running - // on .NET SDK 8.0. We can't even install that workload on older - // versions, and we don't support .NET 9.0 yet, so this helps to - // provide a nicer message than "Workload ID wasi-experimental is not recognized.". - let version = dotnet!("--version").read().unwrap_or_default(); - if parse_major_version(&version) != Some(8) { - anyhow::bail!(concat!( - ".NET SDK 8.0 is required, but found {version}.\n", - "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." - )); - } + if !skip_workload_check() { + // Check if the `wasi-experimental` workload is installed. Unfortunately, we + // have to do this by inspecting the human-readable output. There is a + // hidden `--machine-readable` flag but it also mixes in human-readable + // output as well as unnecessarily updates various unrelated manifests. + match dotnet!("workload", "list").read() { + Ok(workloads) if workloads.contains("wasi-experimental") => {} + Ok(_) => { + // If wasi-experimental is not found, first check if we're running + // on .NET SDK 8.0. We can't even install that workload on older + // versions, and we don't support .NET 9.0 yet, so this helps to + // provide a nicer message than "Workload ID wasi-experimental is not recognized.". 
+ let version = dotnet!("--version").read().unwrap_or_default(); + if parse_major_version(&version) != Some(8) { + anyhow::bail!(concat!( + ".NET SDK 8.0 is required, but found {version}.\n", + "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." + )); + } - // Finally, try to install the workload ourselves. On some systems - // this might require elevated privileges, so print a nice error - // message if it fails. - dotnet!( - "workload", - "install", - "wasi-experimental", - "--skip-manifest-update" - ) - .stderr_capture() - .run() - .context(concat!( - "Couldn't install the required wasi-experimental workload.\n", - "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." - ))?; - } - Err(error) if error.kind() == std::io::ErrorKind::NotFound => { - anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.") - } - Err(error) => anyhow::bail!("{error}"), - }; + // Finally, try to install the workload ourselves. On some systems + // this might require elevated privileges, so print a nice error + // message if it fails. + dotnet!( + "workload", + "install", + "wasi-experimental", + "--skip-manifest-update" + ) + .stderr_capture() + .run() + .context(concat!( + "Couldn't install the required wasi-experimental workload.\n", + "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." + ))?; + } + Err(error) if error.kind() == std::io::ErrorKind::NotFound => { + anyhow::bail!("dotnet not found in PATH. 
Please install .NET SDK 8.0.") + } + Err(error) => anyhow::bail!("{error}"), + }; + } let config_name = if build_debug { "Debug" } else { "Release" }; diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 6109b872314..11fc75bbac5 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -334,6 +334,9 @@ impl DotnetPublisher { // when running multiple dotnet builds in parallel. .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") + // The workflow installs the WASI workload before running benchmarks. + // Avoid `dotnet workload list`, which can segfault on the CI runner. + .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1") } } From 77e2924cb3bc84b0bab69982e69f4e2f8eb300a9 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:13:27 -0400 Subject: [PATCH 21/25] Update llm-benchmark-periodic.yml --- .github/workflows/llm-benchmark-periodic.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index bc456520724..da314290e71 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -81,16 +81,19 @@ jobs: dotnet pack -c Release crates/bindings-csharp/Runtime - name: Set up Node.js + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }} uses: actions/setup-node@v4 with: node-version: 22 - name: Install pnpm + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }} uses: ./.github/actions/setup-pnpm with: run_install: true - name: Build TypeScript SDK + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') 
}} run: pnpm build working-directory: crates/bindings-typescript From 63a9c34025db60a214a7c973d8e8bfa1d03ea8d2 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:15:19 -0400 Subject: [PATCH 22/25] revert tests --- crates/cli/src/tasks/csharp.rs | 87 +++++++++---------- .../src/bench/publishers.rs | 3 - 2 files changed, 39 insertions(+), 51 deletions(-) diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs index 7f76b3199d2..5df8b730448 100644 --- a/crates/cli/src/tasks/csharp.rs +++ b/crates/cli/src/tasks/csharp.rs @@ -1,6 +1,5 @@ use anyhow::Context; use itertools::Itertools; -use std::env; use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; @@ -9,12 +8,6 @@ fn parse_major_version(version: &str) -> Option { version.split('.').next()?.parse::().ok() } -fn skip_workload_check() -> bool { - env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK") - .ok() - .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES")) -} - pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result { // All `dotnet` commands must execute in the project directory, otherwise // global.json won't have any effect and wrong .NET SDK might be picked. @@ -24,48 +17,46 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re }; } - if !skip_workload_check() { - // Check if the `wasi-experimental` workload is installed. Unfortunately, we - // have to do this by inspecting the human-readable output. There is a - // hidden `--machine-readable` flag but it also mixes in human-readable - // output as well as unnecessarily updates various unrelated manifests. - match dotnet!("workload", "list").read() { - Ok(workloads) if workloads.contains("wasi-experimental") => {} - Ok(_) => { - // If wasi-experimental is not found, first check if we're running - // on .NET SDK 8.0. 
We can't even install that workload on older - // versions, and we don't support .NET 9.0 yet, so this helps to - // provide a nicer message than "Workload ID wasi-experimental is not recognized.". - let version = dotnet!("--version").read().unwrap_or_default(); - if parse_major_version(&version) != Some(8) { - anyhow::bail!(concat!( - ".NET SDK 8.0 is required, but found {version}.\n", - "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." - )); - } - - // Finally, try to install the workload ourselves. On some systems - // this might require elevated privileges, so print a nice error - // message if it fails. - dotnet!( - "workload", - "install", - "wasi-experimental", - "--skip-manifest-update" - ) - .stderr_capture() - .run() - .context(concat!( - "Couldn't install the required wasi-experimental workload.\n", - "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." - ))?; + // Check if the `wasi-experimental` workload is installed. Unfortunately, we + // have to do this by inspecting the human-readable output. There is a + // hidden `--machine-readable` flag but it also mixes in human-readable + // output as well as unnecessarily updates various unrelated manifests. + match dotnet!("workload", "list").read() { + Ok(workloads) if workloads.contains("wasi-experimental") => {} + Ok(_) => { + // If wasi-experimental is not found, first check if we're running + // on .NET SDK 8.0. We can't even install that workload on older + // versions, and we don't support .NET 9.0 yet, so this helps to + // provide a nicer message than "Workload ID wasi-experimental is not recognized.". 
+ let version = dotnet!("--version").read().unwrap_or_default(); + if parse_major_version(&version) != Some(8) { + anyhow::bail!(concat!( + ".NET SDK 8.0 is required, but found {version}.\n", + "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json." + )); } - Err(error) if error.kind() == std::io::ErrorKind::NotFound => { - anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.") - } - Err(error) => anyhow::bail!("{error}"), - }; - } + + // Finally, try to install the workload ourselves. On some systems + // this might require elevated privileges, so print a nice error + // message if it fails. + dotnet!( + "workload", + "install", + "wasi-experimental", + "--skip-manifest-update" + ) + .stderr_capture() + .run() + .context(concat!( + "Couldn't install the required wasi-experimental workload.\n", + "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights." + ))?; + } + Err(error) if error.kind() == std::io::ErrorKind::NotFound => { + anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.") + } + Err(error) => anyhow::bail!("{error}"), + }; let config_name = if build_debug { "Debug" } else { "Release" }; diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 11fc75bbac5..6109b872314 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -334,9 +334,6 @@ impl DotnetPublisher { // when running multiple dotnet builds in parallel. .env("MSBUILDDISABLENODEREUSE", "1") .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") - // The workflow installs the WASI workload before running benchmarks. - // Avoid `dotnet workload list`, which can segfault on the CI runner. 
- .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1") } } From 9596077046a37eef609ca2f74f6407bd05b2ef26 Mon Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Mon, 15 Jun 2026 09:06:15 -0400 Subject: [PATCH 23/25] preflight no error; vendor to openrouter in periodic --- .github/workflows/llm-benchmark-periodic.yml | 1 + tools/xtask-llm-benchmark/src/llm/clients/mod.rs | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index da314290e71..da3af16b609 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -112,6 +112,7 @@ jobs: OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + LLM_VENDOR: openrouter LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }} LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }} DOTNET_MULTILEVEL_LOOKUP: "0" diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs index 83454c2677c..254fe5b8f63 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs @@ -8,7 +8,7 @@ pub mod openai; pub mod openrouter; pub mod xai; -use anyhow::{bail, Result}; +use anyhow::Result; use async_trait::async_trait; pub use anthropic::AnthropicClient; @@ -44,11 +44,11 @@ pub trait LlmClient: Send + Sync { fn provider_name(&self) -> &'static str; async fn preflight(&self, model: &str) -> Result { - bail!( - "{} credit preflight is not implemented for model '{}'", + Ok(ClientPreflight::new(format!( + "{} credit preflight not implemented for model '{}'; skipped", self.provider_name(), model - ) + ))) } async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result; From 65e4539f04875dc645d89fcb974aa81ff0d379e8 Mon 
Sep 17 00:00:00 2001 From: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Date: Mon, 15 Jun 2026 10:11:12 -0400 Subject: [PATCH 24/25] lints --- .../xtask-llm-benchmark/src/bench/publishers.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 6109b872314..92622972114 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -482,15 +482,15 @@ impl Publisher for TypeScriptPublisher { if let Some(dir) = nodejs_dir { prepend_paths.push(dir); } - if let Some(ref pnpm) = pnpm_exe { - if let Some(parent) = pnpm.parent() { - prepend_paths.push(parent.to_path_buf()); - } + if let Some(ref pnpm) = pnpm_exe + && let Some(parent) = pnpm.parent() + { + prepend_paths.push(parent.to_path_buf()); } - if let Some(node) = node_exe { - if let Some(parent) = node.parent() { - prepend_paths.push(parent.to_path_buf()); - } + if let Some(node) = node_exe + && let Some(parent) = node.parent() + { + prepend_paths.push(parent.to_path_buf()); } let child_path = if !prepend_paths.is_empty() { let mut paths = path_entries(); From 23ab3f731a8afd61b93dc82180dc9b793cc10740 Mon Sep 17 00:00:00 2001 From: clockwork-labs-bot Date: Mon, 15 Jun 2026 23:47:53 -0400 Subject: [PATCH 25/25] Avoid .NET globalization crash in LLM benchmarks (#5335) # Description of Changes Sets `DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1` only on the benchmark harness command that publishes generated C# modules. This keeps dotnet startup out of localized DateTime/TimeZoneInfo formatting on the CI runner, which was crashing before generated C# module publish could run. Stacked on #5324. 
```bash gh workflow run llm-benchmark-periodic.yml \ --repo ClockworkLabs/SpacetimeDB \ --ref bot/debug-llm-csharp-publish \ -f model_set=explicit \ -f models="openrouter:openai/gpt-5.4-mini" \ -f languages=rust,csharp,typescript \ -f modes=guidelines \ -f tasks=t_000_empty_reducers \ -f dry_run=true ``` # API and ABI breaking changes None. # Expected complexity level and risk 1. CI benchmark harness environment fix. # Testing - [x] `cargo fmt --all` - [x] `cargo check --manifest-path tools/xtask-llm-benchmark/Cargo.toml` - [x] `ruby -e 'require "yaml"; YAML.load_file(".github/workflows/llm-benchmark-periodic.yml"); YAML.load_file(".github/workflows/llm-benchmark-validate-goldens.yml")'` - [x] `git diff --check` --------- Co-authored-by: clockwork-labs-bot --- tools/xtask-llm-benchmark/src/bench/publishers.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 92622972114..b7fb74c6936 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -330,6 +330,10 @@ impl DotnetPublisher { fn configure_dotnet_env(cmd: &mut Command) -> &mut Command { cmd.env("DOTNET_CLI_TELEMETRY_OPTOUT", "1") .env("DOTNET_NOLOGO", "1") + // The CI runner's .NET install can crash while formatting localized + // DateTime/TimeZoneInfo data before publish starts. Force invariant + // globalization so generated C# module publish reaches MSBuild. + .env("DOTNET_SYSTEM_GLOBALIZATION_INVARIANT", "1") // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors // when running multiple dotnet builds in parallel. .env("MSBUILDDISABLENODEREUSE", "1")