From a6382caa2d5b47d7fdd7ac48b29f5870b2fcbe37 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Wed, 10 Jun 2026 15:05:20 -0400
Subject: [PATCH 01/25] llm-benchmark: add provider credit preflight and LlmClient trait

---
 .../llm-benchmark-validate-goldens.yml        |   8 +
 .../src/bin/llm_benchmark.rs                  |  81 ++++++--
 .../src/llm/clients/mod.rs                    |  76 ++++++++
 .../src/llm/clients/openrouter.rs             | 146 +++++++++++++-
 tools/xtask-llm-benchmark/src/llm/provider.rs | 181 +++++++++++-------
 5 files changed, 405 insertions(+), 87 deletions(-)
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index 17384a654e3..591d55a6a59 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -57,6 +57,13 @@ jobs:
       - name: Install pnpm
         if: matrix.lang == 'typescript'
         uses: ./.github/actions/setup-pnpm
+        with:
+          run_install: true
+
+      - name: Build TypeScript SDK
+        if: matrix.lang == 'typescript'
+        run: pnpm build
+        working-directory: crates/bindings-typescript
 
       - name: Build llm-benchmark tool
         run: cargo install --path tools/xtask-llm-benchmark --locked
@@ -72,5 +79,6 @@ jobs:
         env:
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
+          LLM_BENCH_CSHARP_CONCURRENCY: "1"
         run: |
           llm_benchmark run --goldens-only --lang ${{ matrix.lang }}
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index c624fdc4108..2a931fbaa4d 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -255,11 +255,6 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         eprintln!("[warn] failed to upload task catalog: {e}");
     }
 
-    let modes = config
-        .modes
-        .clone()
-        .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect());
-
     let RuntimeInit {
         runtime,
         provider: llm_provider,
@@ -273,7 +268,29 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
     let selectors: Option<Vec<String>> = config.selectors.clone();
     let selectors_ref: Option<&[String]> = selectors.as_deref();
 
+    let modes = config
+        .modes
+        .clone()
+        .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect());
+
+    if config.goldens_only {
+        let rt = runtime.as_ref().expect("runtime required for --goldens-only");
+        rt.block_on(build_goldens_only_for_lang(
+            config.host.clone(),
+            &bench_root,
+            config.lang,
+            selectors_ref,
+        ))?;
+        println!("[{}] goldens-only build complete", config.lang.as_str());
+        return Ok(());
+    }
+
     if !config.goldens_only && !config.hash_only {
+        let rt = runtime.as_ref().expect("failed to initialize runtime for preflight");
+        let provider = llm_provider.as_ref().expect("llm provider required for preflight");
+        let routes = filter_routes(&config);
+        preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?;
+
         let rt = runtime.as_ref().expect("failed to initialize runtime for goldens");
         rt.block_on(ensure_goldens_built_once(
             config.host.clone(),
@@ -517,6 +534,51 @@ fn short_hash(s: &str) -> &str {
     &s[..s.len().min(12)]
 }
 
+fn preflight_llm_routes(
+    runtime: &Runtime,
+    llm_provider: &dyn LlmProvider,
+    routes: &[ModelRoute],
+    modes: &[String],
+) -> Result<()> {
+    if routes.is_empty() {
+        return Ok(());
+    }
+
+    let mut search_flags = Vec::new();
+    if modes.iter().any(|mode| mode == "search") {
+        search_flags.push(true);
+    }
+    if modes.iter().any(|mode| mode != "search") {
+        search_flags.push(false);
+    }
+
+    let mut failures = Vec::new();
+    for route in routes {
+        for search_enabled in &search_flags {
+            let mode_label = if *search_enabled {
+                "search/OpenRouter online"
+            } else {
+                "standard"
+            };
+
+            if let Err(err) = runtime.block_on(llm_provider.preflight_route(route, *search_enabled)) {
+                let msg = format!("{} ({mode_label}): {err:#}", route.display_name);
+                eprintln!("[preflight] FAILED {msg}");
+                failures.push(msg);
+            }
+        }
+    }
+
+    if !failures.is_empty() {
+        anyhow::bail!(
+            "LLM provider preflight failed before benchmark run:\n  - {}",
+            failures.join("\n  - ")
+        );
+    }
+
+    Ok(())
+}
+
 /// Run benchmarks for a single mode.
 fn run_mode_benchmarks(
     mode: &str,
@@ -538,15 +600,6 @@ fn run_mode_benchmarks(
         return Ok(Vec::new());
     }
 
-    if config.goldens_only {
-        let rt = runtime.expect("runtime required for --goldens-only");
-        let sels = config.selectors.as_deref();
-
-        rt.block_on(build_goldens_only_for_lang(config.host.clone(), bench_root, lang, sels))?;
-        println!("{:<12} [{:<10}] goldens-only build complete", mode, lang_str);
-        return Ok(Vec::new());
-    }
-
     // Run benchmarks for all matching routes
     let routes = filter_routes(config);
 
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
index 172beef8ff8..d8eba39c4d0 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
@@ -8,6 +8,9 @@ pub mod openai;
 pub mod openrouter;
 pub mod xai;
 
+use anyhow::{bail, Result};
+use async_trait::async_trait;
+
 pub use anthropic::AnthropicClient;
 pub use deepseek::DeepSeekClient;
 pub use google::GoogleGeminiClient;
@@ -15,3 +18,76 @@ pub use meta::MetaLlamaClient;
 pub use openai::OpenAiClient;
 pub use openrouter::OpenRouterClient;
 pub use xai::XaiGrokClient;
+
+use crate::llm::prompt::BuiltPrompt;
+use crate::llm::types::LlmOutput;
+
+#[derive(Debug, Clone)]
+pub struct ClientPreflight {
+    summary: String,
+}
+
+impl ClientPreflight {
+    pub fn new(summary: impl Into<String>) -> Self {
+        Self {
+            summary: summary.into(),
+        }
+    }
+
+    pub fn summary(&self) -> &str {
+        &self.summary
+    }
+}
+
+#[async_trait]
+pub trait LlmClient: Send + Sync {
+    fn provider_name(&self) -> &'static str;
+
+    async fn preflight(&self, model: &str) -> Result<ClientPreflight> {
+        bail!(
+            "{} credit preflight is not implemented for model '{}'",
+            self.provider_name(),
+            model
+        )
+    }
+
+    async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput>;
+}
+
+macro_rules! impl_direct_llm_client {
+    ($ty:ty, $provider_name:literal) => {
+        #[async_trait]
+        impl LlmClient for $ty {
+            fn provider_name(&self) -> &'static str {
+                $provider_name
+            }
+
+            async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
+                <$ty>::generate(self, model, prompt).await
+            }
+        }
+    };
+}
+
+impl_direct_llm_client!(OpenAiClient, "OpenAI");
+impl_direct_llm_client!(AnthropicClient, "Anthropic");
+impl_direct_llm_client!(GoogleGeminiClient, "Google");
+impl_direct_llm_client!(XaiGrokClient, "xAI");
+impl_direct_llm_client!(DeepSeekClient, "DeepSeek");
+impl_direct_llm_client!(MetaLlamaClient, "Meta");
+
+#[async_trait]
+impl LlmClient for OpenRouterClient {
+    fn provider_name(&self) -> &'static str {
+        "OpenRouter"
+    }
+
+    async fn preflight(&self, _model: &str) -> Result<ClientPreflight> {
+        let status = self.preflight_credits().await?;
+        Ok(ClientPreflight::new(status.summary()))
+    }
+
+    async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
+        OpenRouterClient::generate(self, model, prompt).await
+    }
+}
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
index 623570298af..d35ce789d17 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
@@ -1,5 +1,6 @@
-use anyhow::{anyhow, Context, Result};
-use serde::Serialize;
+use anyhow::{anyhow, bail, Context, Result};
+use serde::{Deserialize, Serialize};
+use std::env;
 
 use super::http::HttpClient;
 use super::oa_compat::OACompatResp;
@@ -35,6 +36,81 @@ impl OpenRouterClient {
         Self { base, api_key, http }
     }
 
+    pub async fn preflight_credits(&self) -> Result<OpenRouterCreditStatus> {
+        let key_info = self.fetch_key_info().await?;
+        let min_credits = min_credits_threshold();
+
+        if let Some(remaining) = key_info.limit_remaining
+            && remaining <= min_credits
+        {
+            bail!(
+                "OpenRouter API key has insufficient remaining credits: {:.4} <= {:.4}",
+                remaining,
+                min_credits
+            );
+        }
+
+        let account = match env::var("OPENROUTER_MANAGEMENT_API_KEY")
+            .ok()
+            .filter(|v| !v.trim().is_empty())
+        {
+            Some(key) => Some(self.fetch_account_credits(&key).await?),
+            None => None,
+        };
+
+        if let Some(account) = &account
+            && account.remaining <= min_credits
+        {
+            bail!(
+                "OpenRouter account has insufficient remaining credits: {:.4} <= {:.4}",
+                account.remaining,
+                min_credits
+            );
+        }
+
+        if account.is_none() && key_info.limit_remaining.is_none() {
+            bail!(
+                "OpenRouter API key has no configured credit limit and account credits were not checked. \
+                 Set OPENROUTER_MANAGEMENT_API_KEY for account balance preflight."
+            );
+        }
+
+        Ok(OpenRouterCreditStatus {
+            key_limit: key_info.limit,
+            key_limit_remaining: key_info.limit_remaining,
+            account_remaining: account.map(|a| a.remaining),
+            min_credits,
+        })
+    }
+
+    async fn fetch_key_info(&self) -> Result<OpenRouterKeyInfo> {
+        let url = format!("{}/key", self.base.trim_end_matches('/'));
+        let auth = HttpClient::bearer(&self.api_key);
+        let body = self
+            .http
+            .get_text(&url, &[auth])
+            .await
+            .with_context(|| format!("OpenRouter key preflight GET {}", url))?;
+
+        let resp: OpenRouterKeyResp = serde_json::from_str(&body).context("parse OpenRouter key response")?;
+        Ok(resp.data)
+    }
+
+    async fn fetch_account_credits(&self, management_key: &str) -> Result<OpenRouterAccountCredits> {
+        let url = format!("{}/credits", self.base.trim_end_matches('/'));
+        let auth = HttpClient::bearer(management_key);
+        let body = self
+            .http
+            .get_text(&url, &[auth])
+            .await
+            .with_context(|| format!("OpenRouter account credit preflight GET {}", url))?;
+
+        let resp: OpenRouterCreditsResp = serde_json::from_str(&body).context("parse OpenRouter credits response")?;
+        Ok(OpenRouterAccountCredits {
+            remaining: resp.data.total_credits - resp.data.total_usage,
+        })
+    }
+
     pub async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
         let url = format!("{}/chat/completions", self.base.trim_end_matches('/'));
 
@@ -125,6 +201,72 @@ impl OpenRouterClient {
     }
 }
 
+#[derive(Debug, Clone)]
+pub struct OpenRouterCreditStatus {
+    pub key_limit: Option<f64>,
+    pub key_limit_remaining: Option<f64>,
+    pub account_remaining: Option<f64>,
+    pub min_credits: f64,
+}
+
+impl OpenRouterCreditStatus {
+    pub fn summary(&self) -> String {
+        let key_remaining = match (self.key_limit, self.key_limit_remaining) {
+            (Some(limit), Some(remaining)) => format!("key remaining {remaining:.4}/{limit:.4}"),
+            (Some(limit), None) => format!("key limit {limit:.4}, remaining unknown"),
+            (None, Some(remaining)) => format!("key remaining {remaining:.4}"),
+            (None, None) => "key has no configured limit".to_string(),
+        };
+
+        match self.account_remaining {
+            Some(remaining) => {
+                format!(
+                    "{key_remaining}; account remaining {remaining:.4}; min {:.4}",
+                    self.min_credits
+                )
+            }
+            None => format!(
+                "{key_remaining}; account balance not checked (set OPENROUTER_MANAGEMENT_API_KEY); min {:.4}",
+                self.min_credits
+            ),
+        }
+    }
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterKeyResp {
+    data: OpenRouterKeyInfo,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterKeyInfo {
+    limit: Option<f64>,
+    limit_remaining: Option<f64>,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterCreditsResp {
+    data: OpenRouterCreditsData,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterCreditsData {
+    total_credits: f64,
+    total_usage: f64,
+}
+
+#[derive(Debug, Clone)]
+struct OpenRouterAccountCredits {
+    remaining: f64,
+}
+
+fn min_credits_threshold() -> f64 {
+    env::var("LLM_MIN_CREDITS")
+        .ok()
+        .and_then(|v| v.trim().parse::<f64>().ok())
+        .unwrap_or(0.0)
+}
+
 /// Context limits for models accessed via OpenRouter.
 /// Uses the same limits as direct clients where known,
 /// falls back to a conservative default.
diff --git a/tools/xtask-llm-benchmark/src/llm/provider.rs b/tools/xtask-llm-benchmark/src/llm/provider.rs
index 65d587d9526..fd906f62773 100644
--- a/tools/xtask-llm-benchmark/src/llm/provider.rs
+++ b/tools/xtask-llm-benchmark/src/llm/provider.rs
@@ -2,7 +2,8 @@ use anyhow::{Context, Result};
 use async_trait::async_trait;
 
 use crate::llm::clients::{
-    AnthropicClient, DeepSeekClient, GoogleGeminiClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, XaiGrokClient,
+    AnthropicClient, DeepSeekClient, GoogleGeminiClient, LlmClient, MetaLlamaClient, OpenAiClient, OpenRouterClient,
+    XaiGrokClient,
 };
 use crate::llm::model_routes::ModelRoute;
 use crate::llm::prompt::BuiltPrompt;
@@ -10,6 +11,7 @@ use crate::llm::types::{LlmOutput, Vendor};
 
 #[async_trait]
 pub trait LlmProvider: Send + Sync {
+    async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()>;
     async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result<LlmOutput>;
 }
 
@@ -51,98 +53,135 @@ impl RouterProvider {
     }
 }
 
+struct ResolvedClient<'a> {
+    client: &'a dyn LlmClient,
+    endpoint_name: &'static str,
+    model: String,
+    fallback_from: Option<&'static str>,
+    search_enabled: bool,
+}
+
 #[async_trait]
 impl LlmProvider for RouterProvider {
+    async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()> {
+        let resolved = self.resolve_client(route, search_enabled)?;
+        let status = resolved.client.preflight(&resolved.model).await.with_context(|| {
+            format!(
+                "{} credit preflight failed for model '{}'",
+                resolved.endpoint_name, resolved.model
+            )
+        })?;
+
+        eprintln!(
+            "[preflight] {} -> {} '{}' OK ({})",
+            route.display_name,
+            resolved.endpoint_name,
+            resolved.model,
+            status.summary()
+        );
+        Ok(())
+    }
+
     async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result<LlmOutput> {
-        // Web search mode: route all models through OpenRouter with :online suffix.
-        // OpenRouter's :online feature adds Bing-powered web search to any model.
-        if prompt.search_enabled {
-            let cli = self.openrouter.as_ref().context(
-                "Search mode requires OPENROUTER_API_KEY — OpenRouter provides unified web search via :online models",
-            )?;
+        let resolved = self.resolve_client(route, prompt.search_enabled)?;
+
+        if resolved.search_enabled {
+            eprintln!(
+                "[search] {} -> OpenRouter :online model '{}'",
+                route.display_name, resolved.model
+            );
+        } else if let Some(vendor_name) = resolved.fallback_from {
+            eprintln!(
+                "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'",
+                vendor_name, resolved.model
+            );
+        }
+
+        resolved.client.generate(&resolved.model, prompt).await
+    }
+}
+
+impl RouterProvider {
+    fn resolve_client<'a>(&'a self, route: &ModelRoute, search_enabled: bool) -> Result<ResolvedClient<'a>> {
+        if search_enabled {
             let base_model = route
                 .openrouter_model
                 .clone()
                 .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model));
-            let online_model = format!("{base_model}:online");
-            eprintln!(
-                "[search] {} → OpenRouter :online model '{}'",
-                route.display_name, online_model
-            );
-            return cli.generate(&online_model, prompt).await;
+            return self.resolve_openrouter(format!("{base_model}:online"), None, true);
         }
 
         let vendor = self.force.unwrap_or(route.vendor);
 
-        // If vendor is explicitly OpenRouter, or if the direct client isn't configured
-        // but OpenRouter is available, route through OpenRouter.
         if vendor == Vendor::OpenRouter {
-            let cli = self
-                .openrouter
-                .as_ref()
-                .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?;
             let model = route.openrouter_model.as_deref().unwrap_or(&route.api_model);
-            return cli.generate(model, prompt).await;
+            return self.resolve_openrouter(model.to_string(), None, false);
         }
 
-        // Try direct client first, fall back to OpenRouter if available.
         match vendor {
-            Vendor::OpenAi => match self.openai.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "OpenAI").await,
-            },
-            Vendor::Anthropic => match self.anthropic.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "Anthropic").await,
-            },
-            Vendor::Google => match self.google.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "Google").await,
-            },
-            Vendor::Xai => match self.xai.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "xAI").await,
-            },
-            Vendor::DeepSeek => match self.deepseek.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "DeepSeek").await,
-            },
-            Vendor::Meta => match self.meta.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "Meta").await,
-            },
+            Vendor::OpenAi => {
+                self.resolve_direct_or_openrouter(self.openai.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
+            }
+            Vendor::Anthropic => {
+                self.resolve_direct_or_openrouter(self.anthropic.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
+            }
+            Vendor::Google => {
+                self.resolve_direct_or_openrouter(self.google.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
+            }
+            Vendor::Xai => {
+                self.resolve_direct_or_openrouter(self.xai.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
+            }
+            Vendor::DeepSeek => {
+                self.resolve_direct_or_openrouter(self.deepseek.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
+            }
+            Vendor::Meta => {
+                self.resolve_direct_or_openrouter(self.meta.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
+            }
             Vendor::OpenRouter => unreachable!("handled above"),
         }
     }
-}
 
-impl RouterProvider {
-    /// Fall back to the OpenRouter client when a direct vendor client is not configured.
-    async fn fallback_openrouter(
-        &self,
+    fn resolve_direct_or_openrouter<'a>(
+        &'a self,
+        direct: Option<&'a dyn LlmClient>,
         route: &ModelRoute,
-        prompt: &BuiltPrompt,
-        vendor_name: &str,
-    ) -> Result<LlmOutput> {
-        match self.openrouter.as_ref() {
-            Some(cli) => {
-                let or_model = route
-                    .openrouter_model
-                    .clone()
-                    .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model));
-                eprintln!(
-                    "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'",
-                    vendor_name, or_model
-                );
-                cli.generate(&or_model, prompt).await
-            }
-            None => anyhow::bail!(
-                "{} client not configured and no OpenRouter fallback available. \
-                 Set {}_API_KEY or OPENROUTER_API_KEY.",
-                vendor_name,
-                vendor_name.to_ascii_uppercase()
-            ),
+        vendor: Vendor,
+    ) -> Result<ResolvedClient<'a>> {
+        if let Some(client) = direct {
+            return Ok(ResolvedClient {
+                client,
+                endpoint_name: vendor.display_name(),
+                model: route.api_model.clone(),
+                fallback_from: None,
+                search_enabled: false,
+            });
         }
+
+        let model = route
+            .openrouter_model
+            .clone()
+            .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model));
+        self.resolve_openrouter(model, Some(vendor.display_name()), false)
+    }
+
+    fn resolve_openrouter<'a>(
+        &'a self,
+        model: String,
+        fallback_from: Option<&'static str>,
+        search_enabled: bool,
+    ) -> Result<ResolvedClient<'a>> {
+        let client = self
+            .openrouter
+            .as_ref()
+            .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?;
+
+        Ok(ResolvedClient {
+            client,
+            endpoint_name: "OpenRouter",
+            model,
+            fallback_from,
+            search_enabled,
+        })
     }
 }
 

From 711ff882a2cb421c6919f629cb5e2eaee594d4f7 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Thu, 11 Jun 2026 10:01:49 -0400
Subject: [PATCH 02/25] llm-benchmark: store provider clients in a Vendor-keyed map

---
 tools/xtask-llm-benchmark/src/llm/provider.rs | 72 ++++++++-----------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/llm/provider.rs b/tools/xtask-llm-benchmark/src/llm/provider.rs
index fd906f62773..355f2e19a3e 100644
--- a/tools/xtask-llm-benchmark/src/llm/provider.rs
+++ b/tools/xtask-llm-benchmark/src/llm/provider.rs
@@ -1,5 +1,6 @@
 use anyhow::{Context, Result};
 use async_trait::async_trait;
+use std::collections::HashMap;
 
 use crate::llm::clients::{
     AnthropicClient, DeepSeekClient, GoogleGeminiClient, LlmClient, MetaLlamaClient, OpenAiClient, OpenRouterClient,
@@ -16,15 +17,7 @@ pub trait LlmProvider: Send + Sync {
 }
 
 pub struct RouterProvider {
-    pub openai: Option<OpenAiClient>,
-    pub anthropic: Option<AnthropicClient>,
-    pub google: Option<GoogleGeminiClient>,
-    pub xai: Option<XaiGrokClient>,
-    pub deepseek: Option<DeepSeekClient>,
-    pub meta: Option<MetaLlamaClient>,
-    /// OpenRouter client used as a unified fallback when a direct vendor client
-    /// is not configured. Set via `OPENROUTER_API_KEY`.
-    pub openrouter: Option<OpenRouterClient>,
+    clients: HashMap<Vendor, Box<dyn LlmClient>>,
     pub force: Option<Vendor>,
 }
 
@@ -40,16 +33,31 @@ impl RouterProvider {
         openrouter: Option<OpenRouterClient>,
         force: Option<Vendor>,
     ) -> Self {
-        Self {
-            openai,
-            anthropic,
-            google,
-            xai,
-            deepseek,
-            meta,
-            openrouter,
-            force,
+        let mut clients: HashMap<Vendor, Box<dyn LlmClient>> = HashMap::new();
+
+        if let Some(client) = openai {
+            clients.insert(Vendor::OpenAi, Box::new(client));
+        }
+        if let Some(client) = anthropic {
+            clients.insert(Vendor::Anthropic, Box::new(client));
+        }
+        if let Some(client) = google {
+            clients.insert(Vendor::Google, Box::new(client));
+        }
+        if let Some(client) = xai {
+            clients.insert(Vendor::Xai, Box::new(client));
+        }
+        if let Some(client) = deepseek {
+            clients.insert(Vendor::DeepSeek, Box::new(client));
         }
+        if let Some(client) = meta {
+            clients.insert(Vendor::Meta, Box::new(client));
+        }
+        if let Some(client) = openrouter {
+            clients.insert(Vendor::OpenRouter, Box::new(client));
+        }
+
+        Self { clients, force }
     }
 }
 
@@ -118,27 +126,8 @@ impl RouterProvider {
             return self.resolve_openrouter(model.to_string(), None, false);
         }
 
-        match vendor {
-            Vendor::OpenAi => {
-                self.resolve_direct_or_openrouter(self.openai.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
-            }
-            Vendor::Anthropic => {
-                self.resolve_direct_or_openrouter(self.anthropic.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
-            }
-            Vendor::Google => {
-                self.resolve_direct_or_openrouter(self.google.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
-            }
-            Vendor::Xai => {
-                self.resolve_direct_or_openrouter(self.xai.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
-            }
-            Vendor::DeepSeek => {
-                self.resolve_direct_or_openrouter(self.deepseek.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
-            }
-            Vendor::Meta => {
-                self.resolve_direct_or_openrouter(self.meta.as_ref().map(|c| c as &dyn LlmClient), route, vendor)
-            }
-            Vendor::OpenRouter => unreachable!("handled above"),
-        }
+        let direct = self.clients.get(&vendor).map(|client| client.as_ref());
+        self.resolve_direct_or_openrouter(direct, route, vendor)
     }
 
     fn resolve_direct_or_openrouter<'a>(
@@ -171,8 +160,9 @@ impl RouterProvider {
         search_enabled: bool,
     ) -> Result<ResolvedClient<'a>> {
         let client = self
-            .openrouter
-            .as_ref()
+            .clients
+            .get(&Vendor::OpenRouter)
+            .map(|client| client.as_ref())
             .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?;
 
         Ok(ResolvedClient {

From e82f0aef0f4e37b220b95f8d16cf14bc2a493eaf Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:45:48 -0400
Subject: [PATCH 03/25] updates

---
 .github/workflows/llm-benchmark-periodic.yml  |  25 +-
 tools/xtask-llm-benchmark/src/api/client.rs   | 327 +++++++++++++++++-
 .../xtask-llm-benchmark/src/bench/analysis.rs |   6 +-
 .../src/bench/publishers.rs                   |   2 +
 tools/xtask-llm-benchmark/src/bench/runner.rs |   5 +-
 tools/xtask-llm-benchmark/src/bench/types.rs  |   5 +
 .../src/bin/llm_benchmark.rs                  | 282 +++++++++++++--
 .../src/llm/clients/anthropic.rs              |   6 +
 .../src/llm/clients/openrouter.rs             |  24 +-
 .../src/llm/model_routes.rs                   |  43 ++-
 .../src/llm/segmentation.rs                   |  38 +-
 11 files changed, 685 insertions(+), 78 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 40ad2c75fe4..c0d9cc93d8c 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -2,11 +2,14 @@ name: Periodic LLM benchmarks
 
 on:
   schedule:
-    # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
-    # or '0 */4 * * *' for every 4h.
-    - cron: '0 0 * * *'
+    # Weekly on Monday at midnight UTC.
+    - cron: '0 0 * * 1'
   workflow_dispatch:
     inputs:
+      run_id:
+        description: 'Website-created benchmark run id. When set, run spec is fetched from the website.'
+        required: false
+        default: ''
       models:
         description: 'Models to run (provider:model format, comma-separated, or "all")'
         required: false
@@ -62,6 +65,12 @@ jobs:
 
       - name: Install pnpm
         uses: ./.github/actions/setup-pnpm
+        with:
+          run_install: true
+
+      - name: Build TypeScript SDK
+        run: pnpm build
+        working-directory: crates/bindings-typescript
 
       - name: Build llm-benchmark tool
         run: cargo install --path tools/xtask-llm-benchmark --locked
@@ -82,19 +91,27 @@ jobs:
           LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
+          LLM_BENCH_CSHARP_CONCURRENCY: "1"
+          INPUT_RUN_ID: ${{ inputs.run_id || '' }}
           INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
           INPUT_MODELS: ${{ inputs.models || 'all' }}
           INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
         run: |
+          RUN_ID="$INPUT_RUN_ID"
           LANGS="$INPUT_LANGUAGES"
           MODELS="$INPUT_MODELS"
           MODES="$INPUT_MODES"
 
+          if [ -n "$RUN_ID" ]; then
+            llm_benchmark run-from-api --run-id "$RUN_ID"
+            exit $?
+          fi
+
           SUCCEEDED=0
           FAILED=0
           for LANG in $(echo "$LANGS" | tr ',' ' '); do
             if [ "$MODELS" = "all" ]; then
-              if llm_benchmark run --lang "$LANG" --modes "$MODES"; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG"
diff --git a/tools/xtask-llm-benchmark/src/api/client.rs b/tools/xtask-llm-benchmark/src/api/client.rs
index edc61756152..13fde0f8e81 100644
--- a/tools/xtask-llm-benchmark/src/api/client.rs
+++ b/tools/xtask-llm-benchmark/src/api/client.rs
@@ -1,14 +1,155 @@
-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
+use serde::Deserialize;
 use serde_json::json;
+use std::str::FromStr;
 
 use crate::bench::normalize::{canonical_mode, normalize_model_names};
 use crate::bench::types::{Results, RunOutcome};
+use crate::eval::Lang;
+use crate::llm::types::Vendor;
+use crate::llm::ModelRoute;
+
+#[derive(Debug, Clone)]
+pub struct RemoteRunSpec {
+    pub run_id: String,
+    pub languages: Vec<Lang>,
+    pub modes: Vec<String>,
+    pub routes: Vec<ModelRoute>,
+    pub categories: Option<Vec<String>>,
+    pub tasks: Option<Vec<String>>,
+}
+
+#[derive(Debug, Deserialize)]
+struct RemoteModelRouteRow {
+    #[serde(alias = "displayName", alias = "name")]
+    display_name: String,
+    vendor: String,
+    #[serde(alias = "apiModel")]
+    api_model: String,
+    #[serde(default, alias = "openrouterModel")]
+    openrouter_model: Option<String>,
+    #[serde(default)]
+    active: Option<bool>,
+    #[serde(default)]
+    available: Option<bool>,
+}
+
+#[derive(Debug, Deserialize)]
+struct RawRunSpec {
+    #[serde(default)]
+    id: Option<String>,
+    #[serde(default, alias = "runId")]
+    run_id: Option<String>,
+    languages: Vec<String>,
+    modes: Vec<String>,
+    #[serde(default, alias = "routes")]
+    models: Vec<RemoteModelRouteRow>,
+    #[serde(default)]
+    categories: Option<Vec<String>>,
+    #[serde(default)]
+    tasks: Option<Vec<String>>,
+}
+
+/// Converts one registry row into a [`ModelRoute`].
+///
+/// Rows explicitly flagged inactive or unavailable yield `Ok(None)`; an unknown vendor or a
+/// blank display name / API model is an error.
+fn parse_model_route_row(row: RemoteModelRouteRow) -> Result<Option<ModelRoute>> {
+    if row.active == Some(false) || row.available == Some(false) {
+        return Ok(None);
+    }
+
+    let vendor = Vendor::parse(&row.vendor).ok_or_else(|| anyhow!("unknown model vendor '{}'", row.vendor))?;
+    let display_name = row.display_name.trim();
+    let api_model = row.api_model.trim();
+    anyhow::ensure!(!display_name.is_empty(), "remote model row is missing display_name");
+    anyhow::ensure!(!api_model.is_empty(), "remote model row '{}' is missing api_model", display_name);
+
+    // Trim the optional OpenRouter id like the fields above; a blank id means "none".
+    let openrouter_model = row
+        .openrouter_model
+        .as_deref()
+        .map(str::trim)
+        .filter(|id| !id.is_empty());
+    Ok(Some(ModelRoute::new(display_name, vendor, api_model, openrouter_model)))
+}
+
+/// Extracts the enabled model routes from a model-registry response.
+///
+/// Accepts either `{"models": [...]}` or a bare array of rows. Rows flagged inactive or
+/// unavailable are skipped; an invalid row, or no enabled row at all, is an error.
+pub fn parse_model_routes_response(body: &serde_json::Value) -> Result<Vec<ModelRoute>> {
+    let models = body.get("models").unwrap_or(body);
+    let rows: Vec<RemoteModelRouteRow> =
+        serde_json::from_value(models.clone()).context("parse llm benchmark model rows")?;
+
+    let routes = rows
+        .into_iter()
+        .map(parse_model_route_row)
+        .filter_map(Result::transpose)
+        .collect::<Result<Vec<_>>>()?;
+    anyhow::ensure!(!routes.is_empty(), "no active available LLM benchmark models returned by website");
+
+    Ok(routes)
+}
+
+/// Parses a run-spec response body into a validated [`RemoteRunSpec`].
+///
+/// The spec may sit under `spec` or `spec_json`, or be the body itself, as a JSON object or as a
+/// string holding JSON. `fallback_run_id` applies when the spec has no non-blank `run_id` / `id`.
+pub fn parse_run_spec_response(body: &serde_json::Value, fallback_run_id: &str) -> Result<RemoteRunSpec> {
+    let spec = body.get("spec").or_else(|| body.get("spec_json")).unwrap_or(body);
+    let spec = match spec.as_str() {
+        Some(s) => serde_json::from_str::<serde_json::Value>(s).context("parse run spec_json string")?,
+        None => spec.clone(),
+    };
+    let raw: RawRunSpec = serde_json::from_value(spec).context("parse llm benchmark run spec")?;
+    let non_blank = |id: &String| !id.trim().is_empty();
+    let run_id = raw.run_id.filter(non_blank).or(raw.id.filter(non_blank));
+    let run_id = run_id.unwrap_or_else(|| fallback_run_id.to_string());
+
+    // Trimmed for parity with `modes`, so padding around a language name is ignored.
+    let languages = raw
+        .languages
+        .iter()
+        .map(|lang| Lang::from_str(lang.trim()).map_err(|e| anyhow!(e)))
+        .collect::<Result<Vec<_>>>()?;
+    anyhow::ensure!(!languages.is_empty(), "run spec '{}' has no languages", run_id);
+
+    let modes: Vec<String> = raw
+        .modes
+        .into_iter()
+        .map(|mode| mode.trim().to_string())
+        .filter(|mode| !mode.is_empty())
+        .collect();
+    anyhow::ensure!(!modes.is_empty(), "run spec '{}' has no modes", run_id);
+
+    // `transpose` drops skipped rows (`Ok(None)`) while still surfacing the first row error.
+    let routes = raw
+        .models
+        .into_iter()
+        .map(parse_model_route_row)
+        .filter_map(Result::transpose)
+        .collect::<Result<Vec<_>>>()?;
+    anyhow::ensure!(!routes.is_empty(), "run spec '{}' has no active available models", run_id);
+    Ok(RemoteRunSpec {
+        run_id,
+        languages,
+        modes,
+        routes,
+        categories: raw.categories,
+        tasks: raw.tasks,
+    })
+}
 
 /// HTTP client for the SpacetimeDB LLM benchmark API (spacetime-web Postgres).
 ///
-/// Supports two POST endpoints that already exist in spacetime-web:
-/// - `POST /api/llm-benchmark-upload` — upload benchmark results
-/// - `POST /api/llm-benchmark-tasks` — upload task catalog
+/// Supports endpoints owned by spacetime-web:
+/// - `POST /api/llm-benchmark-upload` - upload benchmark results
+/// - `POST /api/llm-benchmark-tasks` - upload task catalog
+/// - `GET /api/llm-benchmark-models?active=true` - fetch active benchmark models
+/// - `GET /api/llm-benchmark-runs/{run_id}` - fetch admin-triggered run specs
+/// - `PATCH /api/llm-benchmark-runs/{run_id}` - update admin-triggered run status
 #[derive(Clone)]
 pub struct ApiClient {
     client: reqwest::blocking::Client,
@@ -44,7 +185,13 @@ impl ApiClient {
     /// Upload a batch of run outcomes for a single (lang, mode) combination.
     /// Normalizes model names and sanitizes volatile fields before upload.
     /// If `analysis` is provided, it is stored in the `llm_benchmark_analysis` table.
-    pub fn upload_batch(&self, mode: &str, outcomes: &[RunOutcome], analysis: Option<&str>) -> Result<usize> {
+    pub fn upload_batch(
+        &self,
+        mode: &str,
+        outcomes: &[RunOutcome],
+        analysis: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Result<usize> {
         if outcomes.is_empty() {
             return Ok(0);
         }
@@ -85,12 +232,15 @@ impl ApiClient {
                     }
                 }
 
-                let payload = json!({
+                let mut payload = json!({
                     "lang": lang_entry.lang,
                     "mode": mode_entry.mode,
                     "hash": mode_entry.hash,
                     "models": models_json,
                 });
+                if let Some(run_id) = run_id {
+                    payload["run_id"] = json!(run_id);
+                }
 
                 let resp = self
                     .client
@@ -113,7 +263,7 @@ impl ApiClient {
                     let status = resp.status();
                     let body = resp.text().unwrap_or_default();
                     anyhow::bail!(
-                        "upload failed for {}/{}: {} — {}",
+                        "upload failed for {}/{}: {} - {}",
                         lang_entry.lang,
                         mode_entry.mode,
                         status,
@@ -126,6 +276,100 @@ impl ApiClient {
         Ok(total_uploaded)
     }
 
+    /// Fetch active/available benchmark models from the website model registry.
+    pub fn fetch_model_routes(&self) -> Result<Vec<ModelRoute>> {
+        let url = format!("{}/api/llm-benchmark-models?active=true", self.base_url);
+        let resp = self
+            .client
+            .get(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .send()
+            .context("fetch LLM benchmark models failed")?;
+
+        if resp.status().is_success() {
+            let body: serde_json::Value = resp.json().context("parse model registry response")?;
+            parse_model_routes_response(&body)
+        } else {
+            let status = resp.status();
+            let body = resp.text().unwrap_or_default();
+            anyhow::bail!("fetch LLM benchmark models failed: {} - {}", status, body);
+        }
+    }
+
+    /// Fetch an immutable website-created run spec for admin-triggered runs.
+    pub fn fetch_run_spec(&self, run_id: &str) -> Result<RemoteRunSpec> {
+        let run_id_path = urlencoding::encode(run_id);
+        let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path);
+        let resp = self
+            .client
+            .get(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .send()
+            .with_context(|| format!("fetch LLM benchmark run spec failed for {run_id}"))?;
+
+        if resp.status().is_success() {
+            let body: serde_json::Value = resp.json().context("parse run spec response")?;
+            parse_run_spec_response(&body, run_id)
+        } else {
+            let status = resp.status();
+            let body = resp.text().unwrap_or_default();
+            anyhow::bail!(
+                "fetch LLM benchmark run spec failed for {}: {} - {}",
+                run_id,
+                status,
+                body
+            );
+        }
+    }
+
+    /// Reports the status of a website-created run via `PATCH /api/llm-benchmark-runs/{run_id}`.
+    ///
+    /// `error`, when given, is sent alongside `status`. A non-empty `GITHUB_RUN_ID` is attached as
+    /// well, plus a link to the Actions run when `GITHUB_SERVER_URL` and `GITHUB_REPOSITORY` are
+    /// set. Any non-success HTTP status is returned as an error that includes the response body.
+    pub fn update_run_status(&self, run_id: &str, status: &str, error: Option<&str>) -> Result<()> {
+        let run_id_path = urlencoding::encode(run_id);
+        let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path);
+        let mut payload = json!({
+            "status": status,
+        });
+        if let Some(error) = error {
+            payload["error"] = json!(error);
+        }
+        if let Ok(github_run_id) = std::env::var("GITHUB_RUN_ID")
+            && !github_run_id.is_empty()
+        {
+            if let (Ok(server_url), Ok(repo)) = (std::env::var("GITHUB_SERVER_URL"), std::env::var("GITHUB_REPOSITORY"))
+            {
+                let server_url = server_url.trim_end_matches('/');
+                payload["github_run_url"] = json!(format!("{server_url}/{repo}/actions/runs/{github_run_id}"));
+            }
+            payload["github_run_id"] = json!(github_run_id);
+        }
+
+        let resp = self
+            .client
+            .patch(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .header("Content-Type", "application/json")
+            .json(&payload)
+            .send()
+            .with_context(|| format!("update LLM benchmark run status failed for {run_id}"))?;
+
+        if resp.status().is_success() {
+            Ok(())
+        } else {
+            let status_code = resp.status();
+            let body = resp.text().unwrap_or_default();
+            anyhow::bail!(
+                "update LLM benchmark run status failed for {}: {} - {}",
+                run_id,
+                status_code,
+                body
+            );
+        }
+    }
+
     /// Upload the task catalog to `POST /api/llm-benchmark-tasks`, derived from
     /// the benchmarks directory structure on disk.
     pub fn upload_task_catalog(&self, bench_root: &std::path::Path) -> Result<usize> {
@@ -334,3 +578,72 @@ impl ApiClient {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_active_available_model_routes() {
+        let body = json!({
+            "models": [
+                {
+                    "displayName": "GPT Test",
+                    "vendor": "openai",
+                    "apiModel": "gpt-test",
+                    "openrouterModel": "openai/gpt-test",
+                    "active": true,
+                    "available": true
+                },
+                {
+                    "displayName": "Inactive",
+                    "vendor": "openai",
+                    "apiModel": "inactive",
+                    "active": false,
+                    "available": true
+                },
+                {
+                    "displayName": "Unavailable",
+                    "vendor": "openai",
+                    "apiModel": "unavailable",
+                    "active": true,
+                    "available": false
+                }
+            ]
+        });
+
+        let routes = parse_model_routes_response(&body).unwrap();
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "GPT Test");
+        assert_eq!(routes[0].vendor, Vendor::OpenAi);
+        assert_eq!(routes[0].api_model, "gpt-test");
+        assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test"));
+    }
+
+    #[test]
+    fn parses_run_spec_response() {
+        let body = json!({
+            "spec_json": {
+                "languages": ["rust", "typescript"],
+                "modes": ["guidelines", "no_context"],
+                "categories": ["basics"],
+                "tasks": ["t_001_basic_tables"],
+                "models": [{
+                    "display_name": "Claude Test",
+                    "vendor": "anthropic",
+                    "api_model": "claude-test",
+                    "openrouter_model": "anthropic/claude-test"
+                }]
+            }
+        });
+
+        let spec = parse_run_spec_response(&body, "run-123").unwrap();
+        assert_eq!(spec.run_id, "run-123");
+        assert_eq!(spec.languages, vec![Lang::Rust, Lang::TypeScript]);
+        assert_eq!(spec.modes, vec!["guidelines", "no_context"]);
+        assert_eq!(spec.categories.as_deref(), Some(&["basics".to_string()][..]));
+        assert_eq!(spec.tasks.as_deref(), Some(&["t_001_basic_tables".to_string()][..]));
+        assert_eq!(spec.routes.len(), 1);
+        assert_eq!(spec.routes[0].vendor, Vendor::Anthropic);
+    }
+}
diff --git a/tools/xtask-llm-benchmark/src/bench/analysis.rs b/tools/xtask-llm-benchmark/src/bench/analysis.rs
index 0234cba1b8f..cb23fbb6cf5 100644
--- a/tools/xtask-llm-benchmark/src/bench/analysis.rs
+++ b/tools/xtask-llm-benchmark/src/bench/analysis.rs
@@ -27,10 +27,10 @@ pub async fn run_analysis(
     let prompt = build_prompt(lang, mode, model_name, bench_root, &failures);
 
     let route = ModelRoute::new(
-        "gpt-4.1-mini",
+        "gpt-5.4-mini",
         crate::llm::types::Vendor::OpenAi,
-        "gpt-4.1-mini",
-        Some("openai/gpt-4.1-mini"),
+        "gpt-5.4-mini",
+        Some("openai/gpt-5.4-mini"),
     );
 
     let built = BuiltPrompt {
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 68775ff631c..55b8a98d5b5 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -73,6 +73,8 @@ fn is_transient_build_error(stderr: &str, stdout: &str) -> bool {
         // trying to extract the same tarball simultaneously
         || (combined.contains("wasi-sdk") && combined.contains("tar"))
         || (combined.contains("MSB3073") && combined.contains("exited with code 2"))
+        // dotnet can crash below spacetime while spacetime exits 1.
+        || combined.contains("code <signal")
 }
 
 fn run(cmd: &mut Command, label: &str) -> Result<()> {
diff --git a/tools/xtask-llm-benchmark/src/bench/runner.rs b/tools/xtask-llm-benchmark/src/bench/runner.rs
index 42acd77a70c..92acc429b70 100644
--- a/tools/xtask-llm-benchmark/src/bench/runner.rs
+++ b/tools/xtask-llm-benchmark/src/bench/runner.rs
@@ -633,7 +633,7 @@ pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Resu
             }
         };
         if let Some(ref api) = cfg.api_client {
-            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?;
+            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?;
         } else {
             eprintln!("[runner] no API client configured; skipping upload");
         }
@@ -832,7 +832,7 @@ pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) ->
             }
         };
         if let Some(ref api) = cfg.api_client {
-            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?;
+            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?;
         } else {
             eprintln!("[runner] no API client configured; skipping upload");
         }
@@ -865,6 +865,7 @@ pub async fn run_selected_or_all_for_model_async_for_lang(ctx: &BenchRunContext<
             dry_run: ctx.dry_run,
             local_analysis: ctx.local_analysis,
             dry_run_id: ctx.dry_run_id.clone(),
+            run_id: ctx.run_id.clone(),
         };
         return run_selected_for_model_async_for_lang(&sel_cfg).await;
     }
diff --git a/tools/xtask-llm-benchmark/src/bench/types.rs b/tools/xtask-llm-benchmark/src/bench/types.rs
index 930e3feac1c..57bbed9a149 100644
--- a/tools/xtask-llm-benchmark/src/bench/types.rs
+++ b/tools/xtask-llm-benchmark/src/bench/types.rs
@@ -188,6 +188,7 @@ pub struct BenchRunContext<'a> {
     pub dry_run: bool,
     pub local_analysis: bool,
     pub dry_run_id: Option<String>,
+    pub run_id: Option<String>,
 }
 
 pub struct RunConfig {
@@ -209,4 +210,8 @@ pub struct RunConfig {
     pub local_analysis: bool,
     /// Shared identifier used to group dry-run artifacts
     pub dry_run_id: Option<String>,
+    /// Website-created run identifier for uploaded results
+    pub run_id: Option<String>,
+    /// Website-provided route list used instead of static default_model_routes()
+    pub route_overrides: Option<Vec<ModelRoute>>,
 }
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index 2a931fbaa4d..219a770b502 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -1,7 +1,7 @@
 #![allow(clippy::disallowed_macros, clippy::type_complexity, clippy::enum_variant_names)]
 
 use anyhow::{Context, Result};
-use clap::{Args, Parser, Subcommand};
+use clap::{Args, Parser, Subcommand, ValueEnum};
 use futures::{StreamExt, TryStreamExt};
 use spacetimedb_data_structures::map::{HashCollectionExt as _, HashMap, HashSet};
 use spacetimedb_guard::SpacetimeDbGuard;
@@ -71,11 +71,20 @@ struct Cli {
     command: Commands,
 }
 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
+enum ModelSource {
+    Static,
+    Remote,
+}
+
 #[derive(Subcommand, Debug)]
 enum Commands {
     /// Run benchmarks / build goldens / compute hashes.
     Run(RunArgs),
 
+    /// Run a website-created benchmark spec by id.
+    RunFromApi(RunFromApiArgs),
+
     /// Run AI analysis on existing benchmark failures from the database.
     Analyze(AnalyzeArgs),
 }
@@ -124,6 +133,10 @@ struct RunArgs {
     #[arg(long, num_args = 1..)]
     models: Option<Vec<ModelGroup>>,
 
+    /// Where to resolve models when --models is not provided
+    #[arg(long, value_enum, default_value_t = ModelSource::Static)]
+    model_source: ModelSource,
+
     /// Run benchmarks without uploading results
     #[arg(long)]
     dry_run: bool,
@@ -131,6 +144,19 @@ struct RunArgs {
     /// When used with --dry-run, also generate local markdown analysis files
     #[arg(long, requires = "dry_run")]
     local_analysis: bool,
+
+    #[arg(skip)]
+    route_overrides: Option<Vec<ModelRoute>>,
+
+    #[arg(skip)]
+    run_id: Option<String>,
+}
+
+#[derive(Args, Debug, Clone)]
+struct RunFromApiArgs {
+    /// Website-created llm_benchmark_runs id
+    #[arg(long)]
+    run_id: String,
 }
 
 #[derive(Args, Debug, Clone)]
@@ -202,6 +228,7 @@ fn main() -> Result<()> {
 
     match cli.command {
         Commands::Run(args) => cmd_run(args),
+        Commands::RunFromApi(args) => cmd_run_from_api(args),
         Commands::Analyze(args) => cmd_analyze(args),
     }
 }
@@ -213,11 +240,63 @@ fn cmd_run(args: RunArgs) -> Result<()> {
     Ok(())
 }
 
+/// Runs a website-created benchmark spec and mirrors its lifecycle back to the website.
+///
+/// Fails early if the API client cannot be built. Status updates are best-effort: a failed
+/// update is only logged as a warning, and the result of the run itself is what gets returned.
+fn cmd_run_from_api(args: RunFromApiArgs) -> Result<()> {
+    let api = ApiClient::from_env()
+        .context("failed to initialize API client")?
+        .context("LLM_BENCHMARK_UPLOAD_URL required for run-from-api")?;
+    let run_id = args.run_id.as_str();
+    let report = |status: &str, error: Option<&str>| {
+        if let Err(e) = api.update_run_status(run_id, status, error) {
+            eprintln!("[warn] failed to mark website benchmark run as {status}: {e:#}");
+        }
+    };
+
+    report("running", None);
+
+    let outcome = cmd_run_from_api_inner(&api, run_id);
+    // Report the final state, then hand the run's own result back unchanged.
+    match &outcome {
+        Ok(()) => report("succeeded", None),
+        Err(e) => report("failed", Some(format!("{e:#}").as_str())),
+    }
+    outcome
+}
+
+fn cmd_run_from_api_inner(api: &ApiClient, run_id: &str) -> Result<()> {
+    let spec = api.fetch_run_spec(run_id)?;
+
+    for lang in &spec.languages {
+        run_benchmarks(RunArgs {
+            modes: Some(spec.modes.clone()),
+            lang: *lang,
+            hash_only: false,
+            goldens_only: false,
+            force: false,
+            categories: spec.categories.clone(),
+            tasks: spec.tasks.clone(),
+            providers: None,
+            models: None,
+            model_source: ModelSource::Static,
+            dry_run: false,
+            local_analysis: false,
+            route_overrides: Some(spec.routes.clone()),
+            run_id: Some(spec.run_id.clone()),
+        })?;
+    }
+
+    Ok(())
+}
+
 /// Core benchmark runner used by both `run` and `ci-quickfix`
 fn run_benchmarks(args: RunArgs) -> Result<()> {
     let dry_run = args.dry_run;
     let local_analysis = args.local_analysis;
     let dry_run_id = dry_run.then(|| chrono::Utc::now().format("%Y-%m-%d_%H%M%S").to_string());
+    let should_fetch_remote_routes = should_fetch_remote_routes(&args);
 
     let api_client = if dry_run {
         None
@@ -244,8 +323,17 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         dry_run,
         local_analysis,
         dry_run_id: dry_run_id.clone(),
+        run_id: args.run_id,
+        route_overrides: args.route_overrides,
     };
 
+    if should_fetch_remote_routes {
+        let api = api_client
+            .as_ref()
+            .context("LLM_BENCHMARK_UPLOAD_URL required when --model-source remote is used")?;
+        config.route_overrides = Some(api.fetch_model_routes()?);
+    }
+
     let bench_root = find_bench_root();
 
     // Upload task catalog before running benchmarks
@@ -396,10 +484,10 @@ fn cmd_analyze(args: AnalyzeArgs) -> Result<()> {
     let provider = make_provider_from_env()?;
 
     let analysis_route = ModelRoute::new(
-        "gpt-4.1-mini",
+        "gpt-5.4-mini",
         xtask_llm_benchmark::llm::types::Vendor::OpenAi,
-        "gpt-4.1-mini",
-        Some("openai/gpt-4.1-mini"),
+        "gpt-5.4-mini",
+        Some("openai/gpt-5.4-mini"),
     );
 
     for ((lang, mode, model), group_failures) in &groups {
@@ -534,6 +622,15 @@ fn short_hash(s: &str) -> &str {
     &s[..s.len().min(12)]
 }
 
+fn should_fetch_remote_routes(args: &RunArgs) -> bool {
+    args.model_source == ModelSource::Remote
+        && args.models.is_none()
+        && args.route_overrides.is_none()
+        && !args.dry_run
+        && !args.hash_only
+        && !args.goldens_only
+}
+
 fn preflight_llm_routes(
     runtime: &Runtime,
     llm_provider: &dyn LlmProvider,
@@ -651,7 +748,12 @@ fn run_mode_benchmarks(
 /// When explicit `openrouter:vendor/model` entries are passed they won't appear in
 /// `default_model_routes`, so we synthesize ad-hoc routes for them here.
 fn filter_routes(config: &RunConfig) -> Vec<ModelRoute> {
-    let mut routes: Vec<ModelRoute> = default_model_routes()
+    let base_routes: Vec<ModelRoute> = config
+        .route_overrides
+        .clone()
+        .unwrap_or_else(|| default_model_routes().to_vec());
+
+    let mut routes: Vec<ModelRoute> = base_routes
         .iter()
         .filter(|r| config.providers_filter.as_ref().is_none_or(|f| f.contains(&r.vendor)))
         .filter(|r| match &config.model_filter {
@@ -710,11 +812,13 @@ async fn run_many_routes_for_mode(
     let dry_run = config.dry_run;
     let local_analysis = config.local_analysis;
     let dry_run_id = config.dry_run_id.clone();
+    let run_id = config.run_id.clone();
 
     futures::stream::iter(routes.iter().map(|route| {
         let host = host.clone();
         let api_client = api_client.clone();
         let dry_run_id = dry_run_id.clone();
+        let run_id = run_id.clone();
 
         async move {
             println!("\u{2192} running {}", route.display_name);
@@ -733,6 +837,7 @@ async fn run_many_routes_for_mode(
                 dry_run,
                 local_analysis,
                 dry_run_id,
+                run_id,
             };
 
             let outcomes = run_selected_or_all_for_model_async_for_lang(&per).await?;
@@ -806,8 +911,8 @@ fn find_bench_root() -> PathBuf {
     start.join("src").join("benchmarks")
 }
 
-fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet<String>) -> Result<HashSet<u32>> {
-    let mut nums = HashSet::new();
+fn collect_task_names_in_categories(bench_root: &Path, cats: &HashSet<String>) -> Result<HashSet<String>> {
+    let mut tasks = HashSet::new();
     for c in cats {
         let dir = bench_root.join(c);
         if !dir.is_dir() {
@@ -818,24 +923,38 @@ fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet<String>)
             if !entry.file_type()?.is_dir() {
                 continue;
             }
-            let name = entry.file_name().to_string_lossy().into_owned();
-            if let Some(rest) = name.strip_prefix("t_")
-                && let Some((num_str, _)) = rest.split_once('_')
-                && num_str.len() == 3
-                && let Ok(n) = num_str.parse::<u32>()
-            {
-                nums.insert(n);
-            }
+            tasks.insert(entry.file_name().to_string_lossy().to_ascii_lowercase());
         }
     }
-    Ok(nums)
+    Ok(tasks)
 }
 
-fn normalize_numeric_selectors(raw: &[String]) -> Vec<u32> {
-    raw.iter()
-        .filter(|s| !s.is_empty() && s.chars().all(|c| c.is_ascii_digit()))
-        .filter_map(|s| s.parse::<u32>().ok())
-        .collect()
+fn task_selector_matches_any(selector: &str, allowed_tasks: &HashSet<String>) -> bool {
+    allowed_tasks.iter().any(|task| task.starts_with(selector))
+}
+
+/// Normalizes a user-supplied task selector to its canonical lowercase form.
+///
+/// Bare numbers and `t_<digits>` become a zero-padded `t_NNN` prefix; full ids such as
+/// `t_001_basic_tables` are returned lowercased. Anything else is an error.
+fn normalize_task_filter_selector(raw: &str) -> Result<String> {
+    let s = raw.trim().to_ascii_lowercase();
+    if s.is_empty() {
+        anyhow::bail!("empty task selector");
+    }
+    let (prefixed, rest) = s.strip_prefix("t_").map_or((false, s.as_str()), |rest| (true, rest));
+    // `all` is vacuously true for an empty string, so a bare `t_` must be excluded here.
+    if !rest.is_empty() && rest.chars().all(|c| c.is_ascii_digit()) {
+        let n = rest.parse::<u32>().with_context(|| format!("invalid task selector: {raw}"))?;
+        return Ok(format!("t_{:03}", n));
+    }
+    let is_full_id = prefixed
+        && rest.starts_with(|c: char| c.is_ascii_digit())
+        && rest.chars().all(|c| c.is_ascii_alphanumeric() || c == '_');
+    if is_full_id {
+        return Ok(s);
+    }
+    anyhow::bail!("invalid task selector: {raw}")
+}
 
 fn apply_category_filter(
@@ -849,23 +968,126 @@ fn apply_category_filter(
             Ok(selectors.map(|s| s.to_vec()))
         }
         Some(cats) => {
-            let allowed = collect_task_numbers_in_categories(bench_root, cats)?;
-            let out_nums: Vec<u32> = match selectors {
+            let allowed = collect_task_names_in_categories(bench_root, cats)?;
+            let mut out: Vec<String> = match selectors {
                 Some(user) => {
-                    let nums = normalize_numeric_selectors(user);
-                    nums.into_iter().filter(|n| allowed.contains(n)).collect()
+                    let mut selected = Vec::new();
+                    for selector in user {
+                        let normalized = normalize_task_filter_selector(selector)?;
+                        if task_selector_matches_any(&normalized, &allowed) {
+                            selected.push(normalized);
+                        }
+                    }
+                    selected
                 }
                 None => {
-                    let mut v: Vec<u32> = allowed.into_iter().collect();
+                    let mut v: Vec<String> = allowed.into_iter().collect();
                     v.sort_unstable();
                     v
                 }
             };
-            if out_nums.is_empty() {
-                Ok(None)
-            } else {
-                Ok(Some(out_nums.into_iter().map(|n| n.to_string()).collect()))
+            out.sort();
+            out.dedup();
+            if out.is_empty() {
+                anyhow::bail!("no tasks matched category/task filters");
             }
+            Ok(Some(out))
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn base_run_args() -> RunArgs {
+        RunArgs {
+            modes: None,
+            lang: Lang::Rust,
+            hash_only: false,
+            goldens_only: false,
+            force: false,
+            categories: None,
+            tasks: None,
+            providers: None,
+            models: None,
+            model_source: ModelSource::Static,
+            dry_run: false,
+            local_analysis: false,
+            route_overrides: None,
+            run_id: None,
+        }
+    }
+
+    fn base_config(route_overrides: Option<Vec<ModelRoute>>) -> RunConfig {
+        RunConfig {
+            modes: None,
+            hash_only: false,
+            goldens_only: false,
+            lang: Lang::Rust,
+            providers_filter: None,
+            selectors: None,
+            force: false,
+            categories: None,
+            model_filter: None,
+            host: None,
+            api_client: None,
+            dry_run: false,
+            local_analysis: false,
+            dry_run_id: None,
+            run_id: None,
+            route_overrides,
+        }
+    }
+
+    #[test]
+    fn remote_model_source_fetches_only_for_implicit_models() {
+        let mut args = base_run_args();
+        args.model_source = ModelSource::Remote;
+        assert!(should_fetch_remote_routes(&args));
+
+        args.models = Some(vec![ModelGroup {
+            vendor: Vendor::OpenAi,
+            models: vec!["gpt-test".to_string()],
+        }]);
+        assert!(!should_fetch_remote_routes(&args));
+    }
+
+    #[test]
+    fn filter_routes_uses_remote_route_override() {
+        let remote_route = ModelRoute::new(
+            "Remote Model",
+            Vendor::OpenRouter,
+            "openai/remote-model",
+            Some("openai/remote-model"),
+        );
+        let config = base_config(Some(vec![remote_route]));
+
+        let routes = filter_routes(&config);
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "Remote Model");
+        assert_eq!(routes[0].api_model, "openai/remote-model");
+    }
+
+    #[test]
+    fn category_filter_accepts_full_task_ids() {
+        let root = std::env::temp_dir().join(format!(
+            "llm-benchmark-test-{}",
+            std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap()
+                .as_nanos()
+        ));
+        fs::create_dir_all(root.join("basics").join("t_001_basic_tables")).unwrap();
+        fs::create_dir_all(root.join("schema").join("t_012_product_type")).unwrap();
+
+        let mut categories = HashSet::new();
+        categories.insert("basics".to_string());
+        let selectors = vec!["t_001_basic_tables".to_string(), "t_012_product_type".to_string()];
+
+        let filtered = apply_category_filter(&root, Some(&categories), Some(&selectors)).unwrap();
+        fs::remove_dir_all(&root).unwrap();
+
+        assert_eq!(filtered, Some(vec!["t_001_basic_tables".to_string()]));
+    }
+}
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs
index c7a057c4638..8bb0d1ac734 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs
@@ -237,6 +237,12 @@ fn anthropic_max_output_tokens() -> u32 {
 pub fn normalize_anthropic_model(id: &str) -> &str {
     let lid = id.to_ascii_lowercase().replace('_', "-");
     match lid.as_str() {
+        // Opus 4.8
+        "opus-4.8" | "claude-opus-4.8" | "claude-opus-4-8" => "claude-opus-4-8",
+
+        // Sonnet 4.6
+        "sonnet-4.6" | "claude-sonnet-4.6" | "claude-sonnet-4-6" => "claude-sonnet-4-6",
+
         // Sonnet 4.5
         "sonnet-4.5" | "claude-sonnet-4.5" | "claude-sonnet-4-5" => "claude-sonnet-4-5",
         "claude-sonnet-4-5-20250929" => "claude-sonnet-4-5-20250929",
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
index d35ce789d17..54e0532db34 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
@@ -275,26 +275,44 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize {
 
     // Anthropic
     if m.contains("claude") {
+        if m.contains("4.6")
+            || m.contains("4-6")
+            || m.contains("4.7")
+            || m.contains("4-7")
+            || m.contains("4.8")
+            || m.contains("4-8")
+        {
+            return 1_000_000;
+        }
         return 185_000;
     }
     // OpenAI
+    if m.contains("gpt-5.5") {
+        return 1_050_000;
+    }
     if m.contains("gpt-5") || m.contains("gpt-4.1") {
         return 400_000;
     }
     if m.contains("gpt-4o") || m.contains("gpt-4") {
         return 128_000;
     }
-    // xAI / Grok — leave ~50 k headroom for segments + output on top of trimmed prefix
-    if m.contains("grok-code-fast") {
+    // xAI / Grok
+    if m.contains("grok-build-0.1") || m.contains("grok-code-fast") {
         return 200_000;
     }
+    if m.contains("grok-4.3") {
+        return 1_000_000;
+    }
     if m.contains("grok-4") {
         return 200_000;
     }
     if m.contains("grok") {
         return 90_000;
     }
-    // DeepSeek — hard cap is 131 072 on OpenRouter; leave ~25 k headroom for segments + output
+    // DeepSeek
+    if m.contains("deepseek-v4") {
+        return 1_000_000;
+    }
     if m.contains("deepseek") {
         return 106_000;
     }
diff --git a/tools/xtask-llm-benchmark/src/llm/model_routes.rs b/tools/xtask-llm-benchmark/src/llm/model_routes.rs
index e136976adb6..7f7ae93b66c 100644
--- a/tools/xtask-llm-benchmark/src/llm/model_routes.rs
+++ b/tools/xtask-llm-benchmark/src/llm/model_routes.rs
@@ -13,16 +13,16 @@ pub struct ModelRoute {
 static DEFAULT_ROUTES: LazyLock<Vec<ModelRoute>> = LazyLock::new(|| {
     use Vendor::*;
     vec![
-        // OpenAI: Best GPT-5.2-Codex, Cheaper GPT-5-mini
-        ModelRoute::new("GPT-5.2-Codex", OpenAi, "gpt-5.2-codex", Some("openai/gpt-5.2-codex")),
-        ModelRoute::new("GPT-5-mini", OpenAi, "gpt-5-mini", Some("openai/gpt-5-mini")),
-        // Claude: Best Opus 4.6, Cheaper Sonnet 4.6
-        // Direct API uses dashes (claude-opus-4-6); OpenRouter uses dots (claude-opus-4.6)
+        // OpenAI: Best GPT-5.5, Cheaper GPT-5.4-mini
+        ModelRoute::new("GPT-5.5", OpenAi, "gpt-5.5", Some("openai/gpt-5.5")),
+        ModelRoute::new("GPT-5.4-mini", OpenAi, "gpt-5.4-mini", Some("openai/gpt-5.4-mini")),
+        // Claude: Best Opus 4.8, Cheaper Sonnet 4.6
+        // Direct API uses dashes (claude-opus-4-8); OpenRouter uses dots (claude-opus-4.8)
         ModelRoute::new(
-            "Claude Opus 4.6",
+            "Claude Opus 4.8",
             Anthropic,
-            "claude-opus-4-6",
-            Some("anthropic/claude-opus-4.6"),
+            "claude-opus-4-8",
+            Some("anthropic/claude-opus-4.8"),
         ),
         ModelRoute::new(
             "Claude Sonnet 4.6",
@@ -30,9 +30,9 @@ static DEFAULT_ROUTES: LazyLock<Vec<ModelRoute>> = LazyLock::new(|| {
             "claude-sonnet-4-6",
             Some("anthropic/claude-sonnet-4.6"),
         ),
-        // Grok: Best Grok 4, Cheaper Grok Code
-        ModelRoute::new("Grok 4", Xai, "grok-4", Some("x-ai/grok-4.20-beta")),
-        ModelRoute::new("Grok Code", Xai, "grok-code-fast-1", Some("x-ai/grok-code-fast-1")),
+        // Grok: Best Grok 4.3, coding-specialized Grok Build
+        ModelRoute::new("Grok 4.3", Xai, "grok-4.3", Some("x-ai/grok-4.3")),
+        ModelRoute::new("Grok Build 0.1", Xai, "grok-build-0.1", Some("x-ai/grok-build-0.1")),
         // Gemini: direct via GOOGLE_API_KEY, falls back to OpenRouter if not set
         ModelRoute::new(
             "Gemini 3.1 Pro",
@@ -41,24 +41,23 @@ static DEFAULT_ROUTES: LazyLock<Vec<ModelRoute>> = LazyLock::new(|| {
             Some("google/gemini-3.1-pro-preview"),
         ),
         ModelRoute::new(
-            "Gemini 3 Flash",
+            "Gemini 3.5 Flash",
             Google,
-            "gemini-3-flash-preview",
-            Some("google/gemini-3-flash-preview"),
+            "gemini-3.5-flash",
+            Some("google/gemini-3.5-flash"),
         ),
-        // DeepSeek: Reasoner (thinking), Chat (general)
-        // deepseek-reasoner is listed as deepseek-r1 on OpenRouter
+        // DeepSeek: Pro (highest capability), Flash (cheaper/faster)
         ModelRoute::new(
-            "DeepSeek Reasoner",
+            "DeepSeek V4 Pro",
             DeepSeek,
-            "deepseek-reasoner",
-            Some("deepseek/deepseek-r1"),
+            "deepseek-v4-pro",
+            Some("deepseek/deepseek-v4-pro"),
         ),
         ModelRoute::new(
-            "DeepSeek Chat",
+            "DeepSeek V4 Flash",
             DeepSeek,
-            "deepseek-chat",
-            Some("deepseek/deepseek-chat"),
+            "deepseek-v4-flash",
+            Some("deepseek/deepseek-v4-flash"),
         ),
     ]
 });
diff --git a/tools/xtask-llm-benchmark/src/llm/segmentation.rs b/tools/xtask-llm-benchmark/src/llm/segmentation.rs
index 2926852ada0..26bc481e52f 100644
--- a/tools/xtask-llm-benchmark/src/llm/segmentation.rs
+++ b/tools/xtask-llm-benchmark/src/llm/segmentation.rs
@@ -88,14 +88,29 @@ pub fn build_anthropic_messages(
 }
 
 // Provider-specific context limits
-pub fn anthropic_ctx_limit_tokens(_model: &str) -> usize {
-    // Anthropic hard limit is 200k; reserve ~15k for tokenizer variance + system/segments
+pub fn anthropic_ctx_limit_tokens(model: &str) -> usize {
+    let m = model.to_ascii_lowercase();
+
+    // Claude 4.6/4.7/4.8 (dash or dot form) expose a 1M window. NOTE(review): no headroom reserved here - confirm.
+    if m.contains("4-6")
+        || m.contains("4.6")
+        || m.contains("4-7")
+        || m.contains("4.7")
+        || m.contains("4-8")
+        || m.contains("4.8")
+    {
+        return 1_000_000;
+    }
+
+    // Older Anthropic models are 200k; reserve ~15k for tokenizer variance + system/segments.
     185_000
 }
 
 pub fn openai_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
-    if m.contains("gpt-5") || m.contains("gpt-4.1") {
+    if m.contains("gpt-5.5") {
+        1_050_000
+    } else if m.contains("gpt-5") || m.contains("gpt-4.1") {
         400_000
     } else {
         128_000
@@ -105,7 +120,13 @@ pub fn openai_ctx_limit_tokens(model: &str) -> usize {
 pub fn deepseek_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
 
-    // API limit 128K for deepseek-chat and deepseek-reasoner
+    if m.starts_with("deepseek-v4") {
+        return 1_000_000;
+    }
+    if m.starts_with("deepseek-v3.2") {
+        return 128_000;
+    }
+    // API limit 128K for deepseek-chat and deepseek-reasoner compatibility aliases.
     if m.starts_with("deepseek-reasoner") || m.starts_with("deepseek-r1") {
         return 128_000;
     }
@@ -123,8 +144,8 @@ pub fn deepseek_ctx_limit_tokens(model: &str) -> usize {
 pub fn gemini_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
 
-    // Gemini 2.5 series (very large)
-    if m.contains("2.5") && (m.contains("pro") || m.contains("flash")) {
+    // Gemini 3.x and 2.5 series (very large)
+    if (m.contains("3.") || m.contains("2.5")) && (m.contains("pro") || m.contains("flash")) {
         return 1_000_000;
     }
 
@@ -160,9 +181,12 @@ pub fn meta_ctx_limit_tokens(model: &str) -> usize {
 
 pub fn xai_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
-    if m.contains("grok-code-fast-1") {
+    if m.contains("grok-build-0.1") || m.contains("grok-code-fast-1") {
         return 256_000;
     }
+    if m.contains("grok-4.3") {
+        return 1_000_000;
+    }
     if m.contains("grok-4") || m.contains("grok-3") {
         return 128_000;
     }

From bcdb41de96cebae1eb721b5a148e074c251fac40 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 09:13:52 -0400
Subject: [PATCH 04/25] preflight credit checks; workflow update to use web

---
 .github/workflows/llm-benchmark-periodic.yml  |  36 ++-
 tools/xtask-llm-benchmark/src/api/client.rs   | 265 ++++--------------
 tools/xtask-llm-benchmark/src/bench/runner.rs |  30 +-
 tools/xtask-llm-benchmark/src/bench/types.rs  |   3 -
 .../src/bin/llm_benchmark.rs                  |  98 ++-----
 .../src/llm/clients/mod.rs                    |   4 +-
 .../src/llm/clients/openrouter.rs             | 120 +++++++-
 7 files changed, 239 insertions(+), 317 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index c0d9cc93d8c..183ba1c0ea9 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -6,10 +6,6 @@ on:
     - cron: '0 0 * * 1'
   workflow_dispatch:
     inputs:
-      run_id:
-        description: 'Website-created benchmark run id. When set, run spec is fetched from the website.'
-        required: false
-        default: ''
       models:
         description: 'Models to run (provider:model format, comma-separated, or "all")'
         required: false
@@ -22,6 +18,14 @@ on:
         description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
         required: false
         default: 'guidelines,no_context'
+      categories:
+        description: 'Optional benchmark categories to run (comma-separated)'
+        required: false
+        default: ''
+      tasks:
+        description: 'Optional benchmark task ids/selectors to run (comma-separated)'
+        required: false
+        default: ''
 
 permissions:
   contents: read
@@ -92,33 +96,39 @@ jobs:
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
           LLM_BENCH_CSHARP_CONCURRENCY: "1"
-          INPUT_RUN_ID: ${{ inputs.run_id || '' }}
           INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
           INPUT_MODELS: ${{ inputs.models || 'all' }}
           INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
+          INPUT_CATEGORIES: ${{ inputs.categories || '' }}
+          INPUT_TASKS: ${{ inputs.tasks || '' }}
         run: |
-          RUN_ID="$INPUT_RUN_ID"
           LANGS="$INPUT_LANGUAGES"
           MODELS="$INPUT_MODELS"
           MODES="$INPUT_MODES"
-
-          if [ -n "$RUN_ID" ]; then
-            llm_benchmark run-from-api --run-id "$RUN_ID"
-            exit $?
-          fi
+          CATEGORIES="$INPUT_CATEGORIES"
+          TASKS="$INPUT_TASKS"
 
           SUCCEEDED=0
           FAILED=0
           for LANG in $(echo "$LANGS" | tr ',' ' '); do
+            EXTRA_ARGS=()
+            if [ -n "$CATEGORIES" ]; then
+              EXTRA_ARGS+=(--categories "$CATEGORIES")
+            fi
+            if [ -n "$TASKS" ]; then
+              EXTRA_ARGS+=(--tasks "$TASKS")
+            fi
+
             if [ "$MODELS" = "all" ]; then
-              if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG"
                 FAILED=$((FAILED + 1))
               fi
             else
-              if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "$MODELS"; then
+              read -r -a MODEL_ARGS <<< "$MODELS"
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS"
diff --git a/tools/xtask-llm-benchmark/src/api/client.rs b/tools/xtask-llm-benchmark/src/api/client.rs
index 13fde0f8e81..0b43ccb5bac 100644
--- a/tools/xtask-llm-benchmark/src/api/client.rs
+++ b/tools/xtask-llm-benchmark/src/api/client.rs
@@ -1,53 +1,45 @@
 use anyhow::{anyhow, Context, Result};
-use serde::Deserialize;
 use serde_json::json;
-use std::str::FromStr;
 
 use crate::bench::normalize::{canonical_mode, normalize_model_names};
 use crate::bench::types::{Results, RunOutcome};
-use crate::eval::Lang;
 use crate::llm::types::Vendor;
 use crate::llm::ModelRoute;
 
-#[derive(Debug, Clone)]
-pub struct RemoteRunSpec {
-    pub run_id: String,
-    pub languages: Vec<Lang>,
-    pub modes: Vec<String>,
-    pub routes: Vec<ModelRoute>,
-    pub categories: Option<Vec<String>>,
-    pub tasks: Option<Vec<String>>,
-}
-
-#[derive(Debug, Deserialize)]
+#[derive(Debug)]
 struct RemoteModelRouteRow {
-    #[serde(alias = "displayName", alias = "name")]
     display_name: String,
     vendor: String,
-    #[serde(alias = "apiModel")]
     api_model: String,
-    #[serde(default, alias = "openrouterModel")]
     openrouter_model: Option<String>,
-    #[serde(default)]
     active: Option<bool>,
-    #[serde(default)]
     available: Option<bool>,
 }
 
-#[derive(Debug, Deserialize)]
-struct RawRunSpec {
-    #[serde(default)]
-    id: Option<String>,
-    #[serde(default, alias = "runId")]
-    run_id: Option<String>,
-    languages: Vec<String>,
-    modes: Vec<String>,
-    #[serde(default, alias = "routes")]
-    models: Vec<RemoteModelRouteRow>,
-    #[serde(default)]
-    categories: Option<Vec<String>>,
-    #[serde(default)]
-    tasks: Option<Vec<String>>,
+/// Returns the first string value found under any of `keys` (tried in order), copied into an owned `String`.
+fn read_string_field(row: &serde_json::Map<String, serde_json::Value>, keys: &[&str]) -> Option<String> {
+    let first_str = keys.iter().find_map(|key| row.get(*key)?.as_str());
+    first_str.map(str::to_owned)
+}
+
+/// Returns the first boolean value found under any of `keys`, tried in order; non-boolean values are skipped.
+fn read_bool_field(row: &serde_json::Map<String, serde_json::Value>, keys: &[&str]) -> Option<bool> {
+    keys.iter().find_map(|key| row.get(*key)?.as_bool())
+}
+
+/// Reads one JSON object row (snake_case or camelCase keys); absent display_name/vendor/api_model become "".
+fn parse_model_route_value(value: serde_json::Value) -> Result<RemoteModelRouteRow> {
+    let Some(row) = value.as_object() else {
+        anyhow::bail!("remote model row must be an object");
+    };
+    Ok(RemoteModelRouteRow {
+        display_name: read_string_field(row, &["display_name", "displayName", "name"]).unwrap_or_default(),
+        vendor: read_string_field(row, &["vendor"]).unwrap_or_default(),
+        api_model: read_string_field(row, &["api_model", "apiModel"]).unwrap_or_default(),
+        openrouter_model: read_string_field(row, &["openrouter_model", "openrouterModel"]),
+        active: read_bool_field(row, &["active"]),
+        available: read_bool_field(row, &["available"]),
+    })
+}
 
 fn parse_model_route_row(row: RemoteModelRouteRow) -> Result<Option<ModelRoute>> {
@@ -76,11 +68,12 @@ fn parse_model_route_row(row: RemoteModelRouteRow) -> Result<Option<ModelRoute>>
 
 pub fn parse_model_routes_response(body: &serde_json::Value) -> Result<Vec<ModelRoute>> {
     let models = body.get("models").unwrap_or(body);
-    let rows: Vec<RemoteModelRouteRow> =
+    let rows: Vec<serde_json::Value> =
         serde_json::from_value(models.clone()).context("parse llm benchmark model rows")?;
 
     let mut routes = Vec::new();
-    for row in rows {
+    for row in rows.into_iter().map(parse_model_route_value) {
+        let row = row?;
         if let Some(route) = parse_model_route_row(row)? {
             routes.push(route);
         }
@@ -93,83 +86,33 @@ pub fn parse_model_routes_response(body: &serde_json::Value) -> Result<Vec<Model
     Ok(routes)
 }
 
-pub fn parse_run_spec_response(body: &serde_json::Value, fallback_run_id: &str) -> Result<RemoteRunSpec> {
-    let spec = body.get("spec").or_else(|| body.get("spec_json")).unwrap_or(body);
-    let spec = match spec.as_str() {
-        Some(s) => serde_json::from_str::<serde_json::Value>(s).context("parse run spec_json string")?,
-        None => spec.clone(),
-    };
-
-    let raw: RawRunSpec = serde_json::from_value(spec).context("parse llm benchmark run spec")?;
-    let run_id = raw.run_id.or(raw.id).unwrap_or_else(|| fallback_run_id.to_string());
-
-    let languages = raw
-        .languages
-        .iter()
-        .map(|lang| Lang::from_str(lang).map_err(|e| anyhow!(e)))
-        .collect::<Result<Vec<_>>>()?;
-    if languages.is_empty() {
-        anyhow::bail!("run spec '{}' has no languages", run_id);
-    }
-
-    let modes: Vec<String> = raw
-        .modes
-        .into_iter()
-        .map(|mode| mode.trim().to_string())
-        .filter(|mode| !mode.is_empty())
-        .collect();
-    if modes.is_empty() {
-        anyhow::bail!("run spec '{}' has no modes", run_id);
-    }
-
-    let mut routes = Vec::new();
-    for row in raw.models {
-        if let Some(route) = parse_model_route_row(row)? {
-            routes.push(route);
-        }
-    }
-    if routes.is_empty() {
-        anyhow::bail!("run spec '{}' has no active available models", run_id);
-    }
-
-    Ok(RemoteRunSpec {
-        run_id,
-        languages,
-        modes,
-        routes,
-        categories: raw.categories,
-        tasks: raw.tasks,
-    })
-}
-
 /// HTTP client for the SpacetimeDB LLM benchmark API (spacetime-web Postgres).
 ///
 /// Supports endpoints owned by spacetime-web:
 /// - `POST /api/llm-benchmark-upload` - upload benchmark results
 /// - `POST /api/llm-benchmark-tasks` - upload task catalog
 /// - `GET /api/llm-benchmark-models?active=true` - fetch active benchmark models
-/// - `GET /api/llm-benchmark-runs/{run_id}` - fetch admin-triggered run specs
-/// - `PATCH /api/llm-benchmark-runs/{run_id}` - update admin-triggered run status
 #[derive(Clone)]
 pub struct ApiClient {
-    client: reqwest::blocking::Client,
     base_url: String,
     api_key: String,
 }
 
 impl ApiClient {
     pub fn new(base_url: &str, api_key: &str) -> Result<Self> {
-        let client = reqwest::blocking::Client::builder()
-            .timeout(std::time::Duration::from_secs(120))
-            .build()
-            .context("failed to build HTTP client")?;
         Ok(Self {
-            client,
             base_url: base_url.trim_end_matches('/').to_string(),
             api_key: api_key.to_string(),
         })
     }
 
+    /// Builds a fresh blocking HTTP client with a 120-second timeout; nothing is cached on `self`.
+    fn client(&self) -> Result<reqwest::blocking::Client> {
+        let timeout = std::time::Duration::from_secs(120);
+        let built = reqwest::blocking::Client::builder().timeout(timeout).build();
+        built.context("failed to build HTTP client")
+    }
+
     /// Build from environment variables `LLM_BENCHMARK_UPLOAD_URL` and `LLM_BENCHMARK_API_KEY`.
     /// Returns `None` if `LLM_BENCHMARK_UPLOAD_URL` is not set.
     pub fn from_env() -> Result<Option<Self>> {
@@ -185,13 +128,7 @@ impl ApiClient {
     /// Upload a batch of run outcomes for a single (lang, mode) combination.
     /// Normalizes model names and sanitizes volatile fields before upload.
     /// If `analysis` is provided, it is stored in the `llm_benchmark_analysis` table.
-    pub fn upload_batch(
-        &self,
-        mode: &str,
-        outcomes: &[RunOutcome],
-        analysis: Option<&str>,
-        run_id: Option<&str>,
-    ) -> Result<usize> {
+    pub fn upload_batch(&self, mode: &str, outcomes: &[RunOutcome], analysis: Option<&str>) -> Result<usize> {
         if outcomes.is_empty() {
             return Ok(0);
         }
@@ -218,6 +155,7 @@ impl ApiClient {
         normalize_model_names(&mut results);
 
         let url = format!("{}/api/llm-benchmark-upload", self.base_url);
+        let client = self.client()?;
         let mut total_uploaded = 0usize;
 
         for lang_entry in &results.languages {
@@ -232,18 +170,14 @@ impl ApiClient {
                     }
                 }
 
-                let mut payload = json!({
+                let payload = json!({
                     "lang": lang_entry.lang,
                     "mode": mode_entry.mode,
                     "hash": mode_entry.hash,
                     "models": models_json,
                 });
-                if let Some(run_id) = run_id {
-                    payload["run_id"] = json!(run_id);
-                }
 
-                let resp = self
-                    .client
+                let resp = client
                     .post(&url)
                     .header("Authorization", format!("Bearer {}", self.api_key))
                     .header("Content-Type", "application/json")
@@ -280,7 +214,7 @@ impl ApiClient {
     pub fn fetch_model_routes(&self) -> Result<Vec<ModelRoute>> {
         let url = format!("{}/api/llm-benchmark-models?active=true", self.base_url);
         let resp = self
-            .client
+            .client()?
             .get(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .send()
@@ -296,80 +230,6 @@ impl ApiClient {
         }
     }
 
-    /// Fetch an immutable website-created run spec for admin-triggered runs.
-    pub fn fetch_run_spec(&self, run_id: &str) -> Result<RemoteRunSpec> {
-        let run_id_path = urlencoding::encode(run_id);
-        let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path);
-        let resp = self
-            .client
-            .get(&url)
-            .header("Authorization", format!("Bearer {}", self.api_key))
-            .send()
-            .with_context(|| format!("fetch LLM benchmark run spec failed for {run_id}"))?;
-
-        if resp.status().is_success() {
-            let body: serde_json::Value = resp.json().context("parse run spec response")?;
-            parse_run_spec_response(&body, run_id)
-        } else {
-            let status = resp.status();
-            let body = resp.text().unwrap_or_default();
-            anyhow::bail!(
-                "fetch LLM benchmark run spec failed for {}: {} - {}",
-                run_id,
-                status,
-                body
-            );
-        }
-    }
-
-    /// Update website-created benchmark run status.
-    pub fn update_run_status(&self, run_id: &str, status: &str, error: Option<&str>) -> Result<()> {
-        let run_id_path = urlencoding::encode(run_id);
-        let url = format!("{}/api/llm-benchmark-runs/{}", self.base_url, run_id_path);
-        let mut payload = json!({
-            "status": status,
-        });
-        if let Some(error) = error {
-            payload["error"] = json!(error);
-        }
-        if let Ok(github_run_id) = std::env::var("GITHUB_RUN_ID")
-            && !github_run_id.is_empty()
-        {
-            payload["github_run_id"] = json!(github_run_id);
-            if let (Ok(server_url), Ok(repo)) = (std::env::var("GITHUB_SERVER_URL"), std::env::var("GITHUB_REPOSITORY"))
-            {
-                payload["github_run_url"] = json!(format!(
-                    "{}/{}/actions/runs/{}",
-                    server_url.trim_end_matches('/'),
-                    repo,
-                    payload["github_run_id"].as_str().unwrap_or_default()
-                ));
-            }
-        }
-
-        let resp = self
-            .client
-            .patch(&url)
-            .header("Authorization", format!("Bearer {}", self.api_key))
-            .header("Content-Type", "application/json")
-            .json(&payload)
-            .send()
-            .with_context(|| format!("update LLM benchmark run status failed for {run_id}"))?;
-
-        if resp.status().is_success() {
-            Ok(())
-        } else {
-            let status_code = resp.status();
-            let body = resp.text().unwrap_or_default();
-            anyhow::bail!(
-                "update LLM benchmark run status failed for {}: {} - {}",
-                run_id,
-                status_code,
-                body
-            );
-        }
-    }
-
     /// Upload the task catalog to `POST /api/llm-benchmark-tasks`, derived from
     /// the benchmarks directory structure on disk.
     pub fn upload_task_catalog(&self, bench_root: &std::path::Path) -> Result<usize> {
@@ -451,7 +311,7 @@ impl ApiClient {
         let payload = json!({ "categories": categories });
 
         let resp = self
-            .client
+            .client()?
             .post(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .header("Content-Type", "application/json")
@@ -483,7 +343,7 @@ impl ApiClient {
         let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&"));
 
         let resp = self
-            .client
+            .client()?
             .get(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .send()
@@ -526,7 +386,7 @@ impl ApiClient {
         let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&"));
 
         let resp = self
-            .client
+            .client()?
             .get(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .send()
@@ -560,7 +420,7 @@ impl ApiClient {
 
         let url = format!("{}/api/llm-benchmark-upload", self.base_url);
         let resp = self
-            .client
+            .client()?
             .post(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .header("Content-Type", "application/json")
@@ -621,29 +481,24 @@ mod tests {
     }
 
     #[test]
-    fn parses_run_spec_response() {
+    fn parses_snake_case_model_route_fields() {
         let body = json!({
-            "spec_json": {
-                "languages": ["rust", "typescript"],
-                "modes": ["guidelines", "no_context"],
-                "categories": ["basics"],
-                "tasks": ["t_001_basic_tables"],
-                "models": [{
-                    "display_name": "Claude Test",
-                    "vendor": "anthropic",
-                    "api_model": "claude-test",
-                    "openrouter_model": "anthropic/claude-test"
-                }]
-            }
+            "models": [
+                {
+                    "display_name": "GPT Test",
+                    "vendor": "openai",
+                    "api_model": "gpt-test",
+                    "openrouter_model": "openai/gpt-test",
+                    "active": true,
+                    "available": true
+                }
+            ]
         });
 
-        let spec = parse_run_spec_response(&body, "run-123").unwrap();
-        assert_eq!(spec.run_id, "run-123");
-        assert_eq!(spec.languages, vec![Lang::Rust, Lang::TypeScript]);
-        assert_eq!(spec.modes, vec!["guidelines", "no_context"]);
-        assert_eq!(spec.categories.as_deref(), Some(&["basics".to_string()][..]));
-        assert_eq!(spec.tasks.as_deref(), Some(&["t_001_basic_tables".to_string()][..]));
-        assert_eq!(spec.routes.len(), 1);
-        assert_eq!(spec.routes[0].vendor, Vendor::Anthropic);
+        let routes = parse_model_routes_response(&body).unwrap();
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "GPT Test");
+        assert_eq!(routes[0].api_model, "gpt-test");
+        assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test"));
     }
 }
diff --git a/tools/xtask-llm-benchmark/src/bench/runner.rs b/tools/xtask-llm-benchmark/src/bench/runner.rs
index 92acc429b70..2536b5e5fe1 100644
--- a/tools/xtask-llm-benchmark/src/bench/runner.rs
+++ b/tools/xtask-llm-benchmark/src/bench/runner.rs
@@ -473,6 +473,23 @@ async fn maybe_generate_analysis(cfg: &BenchRunContext<'_>, outcomes: &[RunOutco
     Ok(analysis)
 }
 
+/// Uploads `outcomes` via `cfg.api_client` off the async executor; only logs when no client is configured.
+async fn upload_batch_for_context(
+    cfg: &BenchRunContext<'_>,
+    outcomes: &[RunOutcome],
+    analysis: Option<&str>,
+) -> Result<()> {
+    let Some(api) = cfg.api_client.clone() else {
+        eprintln!("[runner] no API client configured; skipping upload");
+        return Ok(());
+    };
+    // `spawn_blocking` needs a `'static` closure, so hand it owned copies of the borrowed inputs.
+    let (mode, batch, notes) = (cfg.mode.to_string(), outcomes.to_vec(), analysis.map(str::to_string));
+    let upload = move || api.upload_batch(&mode, &batch, notes.as_deref());
+    tokio::task::spawn_blocking(upload).await??;
+    Ok(())
+}
+
 pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Result<Vec<RunOutcome>> {
     let total_wall = Instant::now();
 
@@ -632,11 +649,7 @@ pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Resu
                 None
             }
         };
-        if let Some(ref api) = cfg.api_client {
-            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?;
-        } else {
-            eprintln!("[runner] no API client configured; skipping upload");
-        }
+        upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?;
     } else {
         eprintln!("[runner] no results; skipping upload");
     }
@@ -831,11 +844,7 @@ pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) ->
                 None
             }
         };
-        if let Some(ref api) = cfg.api_client {
-            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref(), cfg.run_id.as_deref())?;
-        } else {
-            eprintln!("[runner] no API client configured; skipping upload");
-        }
+        upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?;
     }
 
     println!(
@@ -865,7 +874,6 @@ pub async fn run_selected_or_all_for_model_async_for_lang(ctx: &BenchRunContext<
             dry_run: ctx.dry_run,
             local_analysis: ctx.local_analysis,
             dry_run_id: ctx.dry_run_id.clone(),
-            run_id: ctx.run_id.clone(),
         };
         return run_selected_for_model_async_for_lang(&sel_cfg).await;
     }
diff --git a/tools/xtask-llm-benchmark/src/bench/types.rs b/tools/xtask-llm-benchmark/src/bench/types.rs
index 57bbed9a149..e54df0d4902 100644
--- a/tools/xtask-llm-benchmark/src/bench/types.rs
+++ b/tools/xtask-llm-benchmark/src/bench/types.rs
@@ -188,7 +188,6 @@ pub struct BenchRunContext<'a> {
     pub dry_run: bool,
     pub local_analysis: bool,
     pub dry_run_id: Option<String>,
-    pub run_id: Option<String>,
 }
 
 pub struct RunConfig {
@@ -210,8 +209,6 @@ pub struct RunConfig {
     pub local_analysis: bool,
     /// Shared identifier used to group dry-run artifacts
     pub dry_run_id: Option<String>,
-    /// Website-created run identifier for uploaded results
-    pub run_id: Option<String>,
     /// Website-provided route list used instead of static default_model_routes()
     pub route_overrides: Option<Vec<ModelRoute>>,
 }
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index 219a770b502..0d6d1f7374f 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -82,9 +82,6 @@ enum Commands {
     /// Run benchmarks / build goldens / compute hashes.
     Run(RunArgs),
 
-    /// Run a website-created benchmark spec by id.
-    RunFromApi(RunFromApiArgs),
-
     /// Run AI analysis on existing benchmark failures from the database.
     Analyze(AnalyzeArgs),
 }
@@ -147,16 +144,6 @@ struct RunArgs {
 
     #[arg(skip)]
     route_overrides: Option<Vec<ModelRoute>>,
-
-    #[arg(skip)]
-    run_id: Option<String>,
-}
-
-#[derive(Args, Debug, Clone)]
-struct RunFromApiArgs {
-    /// Website-created llm_benchmark_runs id
-    #[arg(long)]
-    run_id: String,
 }
 
 #[derive(Args, Debug, Clone)]
@@ -228,7 +215,6 @@ fn main() -> Result<()> {
 
     match cli.command {
         Commands::Run(args) => cmd_run(args),
-        Commands::RunFromApi(args) => cmd_run_from_api(args),
         Commands::Analyze(args) => cmd_analyze(args),
     }
 }
@@ -240,57 +226,6 @@ fn cmd_run(args: RunArgs) -> Result<()> {
     Ok(())
 }
 
-fn cmd_run_from_api(args: RunFromApiArgs) -> Result<()> {
-    let api = ApiClient::from_env()
-        .context("failed to initialize API client")?
-        .context("LLM_BENCHMARK_UPLOAD_URL required for run-from-api")?;
-    if let Err(e) = api.update_run_status(&args.run_id, "running", None) {
-        eprintln!("[warn] failed to mark website benchmark run as running: {e:#}");
-    }
-
-    let result = cmd_run_from_api_inner(&api, &args.run_id);
-    match result {
-        Ok(()) => {
-            if let Err(e) = api.update_run_status(&args.run_id, "succeeded", None) {
-                eprintln!("[warn] failed to mark website benchmark run as succeeded: {e:#}");
-            }
-            Ok(())
-        }
-        Err(e) => {
-            let message = format!("{e:#}");
-            if let Err(status_err) = api.update_run_status(&args.run_id, "failed", Some(&message)) {
-                eprintln!("[warn] failed to mark website benchmark run as failed: {status_err:#}");
-            }
-            Err(e)
-        }
-    }
-}
-
-fn cmd_run_from_api_inner(api: &ApiClient, run_id: &str) -> Result<()> {
-    let spec = api.fetch_run_spec(run_id)?;
-
-    for lang in &spec.languages {
-        run_benchmarks(RunArgs {
-            modes: Some(spec.modes.clone()),
-            lang: *lang,
-            hash_only: false,
-            goldens_only: false,
-            force: false,
-            categories: spec.categories.clone(),
-            tasks: spec.tasks.clone(),
-            providers: None,
-            models: None,
-            model_source: ModelSource::Static,
-            dry_run: false,
-            local_analysis: false,
-            route_overrides: Some(spec.routes.clone()),
-            run_id: Some(spec.run_id.clone()),
-        })?;
-    }
-
-    Ok(())
-}
-
 /// Core benchmark runner used by both `run` and `ci-quickfix`
 fn run_benchmarks(args: RunArgs) -> Result<()> {
     let dry_run = args.dry_run;
@@ -323,7 +258,6 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         dry_run,
         local_analysis,
         dry_run_id: dry_run_id.clone(),
-        run_id: args.run_id,
         route_overrides: args.route_overrides,
     };
 
@@ -624,7 +558,6 @@ fn short_hash(s: &str) -> &str {
 
 fn should_fetch_remote_routes(args: &RunArgs) -> bool {
     args.model_source == ModelSource::Remote
-        && args.models.is_none()
         && args.route_overrides.is_none()
         && !args.dry_run
         && !args.hash_only
@@ -782,6 +715,7 @@ fn filter_routes(config: &RunConfig) -> Vec<ModelRoute> {
                 let already_matched = routes.iter().any(|r| {
                     r.vendor == *vendor
                         && (r.api_model == model_id.as_str()
+                            || r.display_name.to_ascii_lowercase() == model_id.as_str()
                             || r.openrouter_model.as_deref() == Some(model_id.as_str()))
                 });
                 if !already_matched {
@@ -812,13 +746,11 @@ async fn run_many_routes_for_mode(
     let dry_run = config.dry_run;
     let local_analysis = config.local_analysis;
     let dry_run_id = config.dry_run_id.clone();
-    let run_id = config.run_id.clone();
 
     futures::stream::iter(routes.iter().map(|route| {
         let host = host.clone();
         let api_client = api_client.clone();
         let dry_run_id = dry_run_id.clone();
-        let run_id = run_id.clone();
 
         async move {
             println!("\u{2192} running {}", route.display_name);
@@ -837,7 +769,6 @@ async fn run_many_routes_for_mode(
                 dry_run,
                 local_analysis,
                 dry_run_id,
-                run_id,
             };
 
             let outcomes = run_selected_or_all_for_model_async_for_lang(&per).await?;
@@ -1015,7 +946,6 @@ mod tests {
             dry_run: false,
             local_analysis: false,
             route_overrides: None,
-            run_id: None,
         }
     }
 
@@ -1035,13 +965,12 @@ mod tests {
             dry_run: false,
             local_analysis: false,
             dry_run_id: None,
-            run_id: None,
             route_overrides,
         }
     }
 
     #[test]
-    fn remote_model_source_fetches_only_for_implicit_models() {
+    fn remote_model_source_fetches_even_for_explicit_models() {
         let mut args = base_run_args();
         args.model_source = ModelSource::Remote;
         assert!(should_fetch_remote_routes(&args));
@@ -1050,7 +979,7 @@ mod tests {
             vendor: Vendor::OpenAi,
             models: vec!["gpt-test".to_string()],
         }]);
-        assert!(!should_fetch_remote_routes(&args));
+        assert!(should_fetch_remote_routes(&args));
     }
 
     #[test]
@@ -1069,6 +998,27 @@ mod tests {
         assert_eq!(routes[0].api_model, "openai/remote-model");
     }
 
+    #[test]
+    fn filter_routes_does_not_synthesize_duplicate_for_display_name_match() {
+        let remote_route = ModelRoute::new(
+            "DeepSeek V4 Flash",
+            Vendor::DeepSeek,
+            "deepseek-v4-flash",
+            Some("deepseek/deepseek-v4-flash"),
+        );
+        let mut config = base_config(Some(vec![remote_route]));
+        let mut allowed = HashSet::new();
+        allowed.insert("deepseek v4 flash".to_string());
+        let mut filter = HashMap::new();
+        filter.insert(Vendor::DeepSeek, allowed);
+        config.model_filter = Some(filter);
+
+        let routes = filter_routes(&config);
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "DeepSeek V4 Flash");
+        assert_eq!(routes[0].api_model, "deepseek-v4-flash");
+    }
+
     #[test]
     fn category_filter_accepts_full_task_ids() {
         let root = std::env::temp_dir().join(format!(
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
index d8eba39c4d0..83454c2677c 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
@@ -82,8 +82,8 @@ impl LlmClient for OpenRouterClient {
         "OpenRouter"
     }
 
-    async fn preflight(&self, _model: &str) -> Result<ClientPreflight> {
-        let status = self.preflight_credits().await?;
+    async fn preflight(&self, model: &str) -> Result<ClientPreflight> {
+        let status = self.preflight_credits(model).await?;
         Ok(ClientPreflight::new(status.summary()))
     }
 
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
index 54e0532db34..61d6998728c 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
@@ -36,9 +36,11 @@ impl OpenRouterClient {
         Self { base, api_key, http }
     }
 
-    pub async fn preflight_credits(&self) -> Result<OpenRouterCreditStatus> {
+    pub async fn preflight_credits(&self, model: &str) -> Result<OpenRouterCreditStatus> {
         let key_info = self.fetch_key_info().await?;
         let min_credits = min_credits_threshold();
+        let mut unchecked_allowed = false;
+        let mut model_probe = None;
 
         if let Some(remaining) = key_info.limit_remaining
             && remaining <= min_credits
@@ -69,10 +71,12 @@ impl OpenRouterClient {
         }
 
         if account.is_none() && key_info.limit_remaining.is_none() {
-            bail!(
-                "OpenRouter API key has no configured credit limit and account credits were not checked. \
-                 Set OPENROUTER_MANAGEMENT_API_KEY for account balance preflight."
-            );
+            if allow_unchecked_credits() {
+                unchecked_allowed = true;
+            } else {
+                self.probe_model(model).await?;
+                model_probe = Some(model.to_string());
+            }
         }
 
         Ok(OpenRouterCreditStatus {
@@ -80,6 +84,8 @@ impl OpenRouterClient {
             key_limit_remaining: key_info.limit_remaining,
             account_remaining: account.map(|a| a.remaining),
             min_credits,
+            model_probe,
+            unchecked_allowed,
         })
     }
 
@@ -111,6 +117,51 @@ impl OpenRouterClient {
         })
     }
 
+    async fn probe_model(&self, model: &str) -> Result<()> {
+        let url = format!("{}/chat/completions", self.base.trim_end_matches('/'));
+
+        #[derive(Serialize)]
+        struct Req<'a> {
+            model: &'a str,
+            messages: [Msg<'a>; 1],
+            temperature: f32,
+            max_tokens: u32,
+        }
+
+        #[derive(Serialize)]
+        struct Msg<'a> {
+            role: &'a str,
+            content: &'a str,
+        }
+
+        let req = Req {
+            model,
+            messages: [Msg {
+                role: "user",
+                content: "ping",
+            }],
+            temperature: 0.0,
+            max_tokens: 1,
+        };
+        let auth = HttpClient::bearer(&self.api_key);
+        let body = self
+            .http
+            .post_json(&url, &[auth], &req)
+            .await
+            .with_context(|| format!("OpenRouter model probe failed for '{model}'"))?;
+
+        let resp: serde_json::Value = serde_json::from_str(&body).context("parse OpenRouter probe response")?;
+        if let Some(err) = resp.get("error") {
+            let message = err
+                .get("message")
+                .and_then(|message| message.as_str())
+                .unwrap_or("unknown OpenRouter probe error");
+            bail!("OpenRouter model probe failed for '{}': {}", model, message);
+        }
+
+        Ok(())
+    }
+
     pub async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
         let url = format!("{}/chat/completions", self.base.trim_end_matches('/'));
 
@@ -207,6 +258,8 @@ pub struct OpenRouterCreditStatus {
     pub key_limit_remaining: Option<f64>,
     pub account_remaining: Option<f64>,
     pub min_credits: f64,
+    pub model_probe: Option<String>,
+    pub unchecked_allowed: bool,
 }
 
 impl OpenRouterCreditStatus {
@@ -218,7 +271,7 @@ impl OpenRouterCreditStatus {
             (None, None) => "key has no configured limit".to_string(),
         };
 
-        match self.account_remaining {
+        let credit_status = match self.account_remaining {
             Some(remaining) => {
                 format!(
                     "{key_remaining}; account remaining {remaining:.4}; min {:.4}",
@@ -229,6 +282,14 @@ impl OpenRouterCreditStatus {
                 "{key_remaining}; account balance not checked (set OPENROUTER_MANAGEMENT_API_KEY); min {:.4}",
                 self.min_credits
             ),
+        };
+
+        if let Some(model) = &self.model_probe {
+            format!("{credit_status}; model probe OK for '{model}'")
+        } else if self.unchecked_allowed {
+            format!("{credit_status}; unchecked credits allowed by OPENROUTER_ALLOW_UNCHECKED_CREDITS")
+        } else {
+            credit_status
         }
     }
 }
@@ -261,12 +322,30 @@ struct OpenRouterAccountCredits {
 }
 
 fn min_credits_threshold() -> f64 {
-    env::var("LLM_MIN_CREDITS")
-        .ok()
-        .and_then(|v| v.trim().parse::<f64>().ok())
+    let openrouter = env::var("OPENROUTER_MIN_CREDITS").ok();
+    let global = env::var("LLM_MIN_CREDITS").ok();
+    parse_min_credits_threshold(openrouter.as_deref(), global.as_deref())
+}
+
+fn allow_unchecked_credits() -> bool {
+    let value = env::var("OPENROUTER_ALLOW_UNCHECKED_CREDITS").ok();
+    parse_env_flag(value.as_deref())
+}
+
+fn parse_min_credits_threshold(openrouter: Option<&str>, global: Option<&str>) -> f64 {
+    [openrouter, global]
+        .into_iter()
+        .flatten()
+        .find_map(|v| v.trim().parse::<f64>().ok())
         .unwrap_or(0.0)
 }
 
+fn parse_env_flag(value: Option<&str>) -> bool {
+    value
+        .map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "y"))
+        .unwrap_or(false)
+}
+
 /// Context limits for models accessed via OpenRouter.
 /// Uses the same limits as direct clients where known,
 /// falls back to a conservative default.
@@ -333,3 +412,26 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize {
 
     DEFAULT_CTX_LIMIT
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{parse_env_flag, parse_min_credits_threshold};
+
+    #[test]
+    fn openrouter_min_credits_overrides_global_threshold() {
+        assert_eq!(parse_min_credits_threshold(Some("2.5"), Some("1.0")), 2.5);
+        assert_eq!(parse_min_credits_threshold(None, Some("1.0")), 1.0);
+        assert_eq!(parse_min_credits_threshold(Some("not-a-number"), Some("1.0")), 1.0);
+        assert_eq!(parse_min_credits_threshold(None, None), 0.0);
+    }
+
+    #[test]
+    fn unchecked_credit_escape_hatch_accepts_common_true_values() {
+        for value in ["1", "true", "TRUE", " yes ", "y"] {
+            assert!(parse_env_flag(Some(value)));
+        }
+        for value in [None, Some(""), Some("0"), Some("false"), Some("no")] {
+            assert!(!parse_env_flag(value));
+        }
+    }
+}

From f2179a25897a102da705371e720e7a78e90da1bf Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 10:25:20 -0400
Subject: [PATCH 05/25] weekly goldens; workflow refinements

---
 .github/workflows/llm-benchmark-periodic.yml  | 20 ++++++++++-----
 .../llm-benchmark-validate-goldens.yml        | 24 ++++++++++++------
 .../src/bin/llm_benchmark.rs                  | 25 ++++++++++---------
 3 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 183ba1c0ea9..cf4b57976f8 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -7,7 +7,7 @@ on:
   workflow_dispatch:
     inputs:
       models:
-        description: 'Models to run (provider:model format, comma-separated, or "all")'
+        description: 'Models to run ("all", or space-separated provider:model groups; each group may contain comma-separated models)'
         required: false
         default: 'all'
       languages:
@@ -26,12 +26,16 @@ on:
         description: 'Optional benchmark task ids/selectors to run (comma-separated)'
         required: false
         default: ''
+      dry_run:
+        description: 'Run benchmarks without uploading results'
+        required: false
+        default: 'false'
 
 permissions:
   contents: read
 
 concurrency:
-  group: llm-benchmark-periodic
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -40,10 +44,9 @@ jobs:
     timeout-minutes: 180
 
     steps:
-      - name: Checkout master
+      - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: master
           fetch-depth: 1
 
       - uses: dtolnay/rust-toolchain@stable
@@ -101,12 +104,14 @@ jobs:
           INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
           INPUT_CATEGORIES: ${{ inputs.categories || '' }}
           INPUT_TASKS: ${{ inputs.tasks || '' }}
+          INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }}
         run: |
           LANGS="$INPUT_LANGUAGES"
           MODELS="$INPUT_MODELS"
           MODES="$INPUT_MODES"
           CATEGORIES="$INPUT_CATEGORIES"
           TASKS="$INPUT_TASKS"
+          DRY_RUN="$INPUT_DRY_RUN"
 
           SUCCEEDED=0
           FAILED=0
@@ -118,6 +123,9 @@ jobs:
             if [ -n "$TASKS" ]; then
               EXTRA_ARGS+=(--tasks "$TASKS")
             fi
+            if [ "$DRY_RUN" = "true" ]; then
+              EXTRA_ARGS+=(--dry-run)
+            fi
 
             if [ "$MODELS" = "all" ]; then
               if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then
@@ -137,7 +145,7 @@ jobs:
             fi
           done
           echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"
-          if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
-            echo "::error::All benchmark runs failed"
+          if [ "$FAILED" -gt 0 ]; then
+            echo "::error::$FAILED benchmark run(s) failed"
             exit 1
           fi
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index 591d55a6a59..cb4c532833e 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -2,15 +2,26 @@ name: Validate LLM benchmark golden answers
 
 on:
   schedule:
-    # Nightly at 2 AM UTC
-    - cron: '0 2 * * *'
-  workflow_dispatch: {}
+    # Weekly on Monday at 2 AM UTC.
+    - cron: '0 2 * * 1'
+  workflow_dispatch:
+    inputs:
+      lang:
+        description: 'Language to validate for manual smoke runs'
+        required: false
+        type: choice
+        default: all
+        options:
+          - all
+          - rust
+          - csharp
+          - typescript
 
 permissions:
   contents: read
 
 concurrency:
-  group: llm-benchmark-validate-goldens
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -21,13 +32,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        lang: [rust, csharp, typescript]
+        lang: ${{ fromJSON(github.event_name == 'workflow_dispatch' && inputs.lang != 'all' && format('["{0}"]', inputs.lang) || '["rust","csharp","typescript"]') }}
 
     steps:
-      - name: Checkout master
+      - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: master
           fetch-depth: 1
 
       - uses: dtolnay/rust-toolchain@stable
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index 0d6d1f7374f..72b551a9540 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -233,13 +233,15 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
     let dry_run_id = dry_run.then(|| chrono::Utc::now().format("%Y-%m-%d_%H%M%S").to_string());
     let should_fetch_remote_routes = should_fetch_remote_routes(&args);
 
-    let api_client = if dry_run {
-        None
-    } else {
+    let needs_api_client = should_fetch_remote_routes || !dry_run;
+    let api_client = if needs_api_client {
         ApiClient::from_env().context("failed to initialize API client")?
+    } else {
+        None
     };
+    let upload_client = if dry_run { None } else { api_client.clone() };
 
-    if api_client.is_none() && !dry_run {
+    if upload_client.is_none() && !dry_run {
         eprintln!("[warn] LLM_BENCHMARK_UPLOAD_URL not set; results will not be uploaded");
     }
 
@@ -254,7 +256,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         categories: categories_to_set(args.categories),
         model_filter: model_filter_from_groups(args.models),
         host: None,
-        api_client: api_client.clone(),
+        api_client: upload_client.clone(),
         dry_run,
         local_analysis,
         dry_run_id: dry_run_id.clone(),
@@ -271,7 +273,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
     let bench_root = find_bench_root();
 
     // Upload task catalog before running benchmarks
-    if let Some(ref api) = api_client
+    if let Some(ref api) = upload_client
         && let Err(e) = api.upload_task_catalog(&bench_root)
     {
         eprintln!("[warn] failed to upload task catalog: {e}");
@@ -557,11 +559,7 @@ fn short_hash(s: &str) -> &str {
 }
 
 fn should_fetch_remote_routes(args: &RunArgs) -> bool {
-    args.model_source == ModelSource::Remote
-        && args.route_overrides.is_none()
-        && !args.dry_run
-        && !args.hash_only
-        && !args.goldens_only
+    args.model_source == ModelSource::Remote && args.route_overrides.is_none() && !args.hash_only && !args.goldens_only
 }
 
 fn preflight_llm_routes(
@@ -970,7 +968,7 @@ mod tests {
     }
 
     #[test]
-    fn remote_model_source_fetches_even_for_explicit_models() {
+    fn remote_model_source_fetches_for_all_model_selection_paths() {
         let mut args = base_run_args();
         args.model_source = ModelSource::Remote;
         assert!(should_fetch_remote_routes(&args));
@@ -980,6 +978,9 @@ mod tests {
             models: vec!["gpt-test".to_string()],
         }]);
         assert!(should_fetch_remote_routes(&args));
+
+        args.dry_run = true;
+        assert!(should_fetch_remote_routes(&args));
     }
 
     #[test]

From 8d1d27ec4c9e20e678b02eda4a6f7f25f68e0484 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:13:30 -0400
Subject: [PATCH 06/25] Update publishers.rs

---
 .../src/bench/publishers.rs                   | 251 +++++++++++++-----
 1 file changed, 190 insertions(+), 61 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 55b8a98d5b5..7ba383dbc06 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -31,6 +31,121 @@ fn pnpm_minimum_release_age() -> Result<String> {
         .ok_or_else(|| anyhow::anyhow!("pnpm-workspace.yaml is missing minimumReleaseAge"))
 }
 
+fn path_entries() -> Vec<PathBuf> {
+    #[cfg(windows)]
+    let path = env::var_os("Path").or_else(|| env::var_os("PATH"));
+    #[cfg(not(windows))]
+    let path = env::var_os("PATH");
+
+    path.map(|path| env::split_paths(&path).collect()).unwrap_or_default()
+}
+
+fn command_path_candidates(name: &str) -> Vec<String> {
+    #[cfg(windows)]
+    {
+        let path = Path::new(name);
+        if path.extension().is_some() {
+            vec![name.to_string()]
+        } else {
+            vec![
+                format!("{name}.cmd"),
+                format!("{name}.exe"),
+                format!("{name}.bat"),
+                name.to_string(),
+            ]
+        }
+    }
+    #[cfg(not(windows))]
+    {
+        vec![name.to_string()]
+    }
+}
+
+fn resolve_command_on_path(name: &str) -> Option<PathBuf> {
+    for dir in path_entries() {
+        for candidate in command_path_candidates(name) {
+            let path = dir.join(candidate);
+            if path.is_file() {
+                return Some(path);
+            }
+        }
+    }
+    None
+}
+
+fn configured_nodejs_dir() -> Option<PathBuf> {
+    env::var("NODEJS_DIR")
+        .ok()
+        .map(|s| s.trim().trim_matches('"').trim().to_string())
+        .filter(|s| !s.is_empty())
+        .map(PathBuf::from)
+}
+
+fn pnpm_in_dir(dir: &Path) -> Option<PathBuf> {
+    #[cfg(windows)]
+    {
+        for candidate in ["pnpm.cmd", "pnpm.exe", "pnpm.bat"] {
+            let path = dir.join(candidate);
+            if path.is_file() {
+                return Some(path);
+            }
+        }
+        None
+    }
+    #[cfg(not(windows))]
+    {
+        let path = dir.join("pnpm");
+        path.is_file().then_some(path)
+    }
+}
+
+fn node_in_dir(dir: &Path) -> Option<PathBuf> {
+    #[cfg(windows)]
+    let path = dir.join("node.exe");
+    #[cfg(not(windows))]
+    let path = dir.join("node");
+
+    path.is_file().then_some(path)
+}
+
+fn resolve_node_exe(nodejs_dir: Option<&Path>) -> Option<PathBuf> {
+    nodejs_dir
+        .and_then(node_in_dir)
+        .or_else(|| resolve_command_on_path("node"))
+        .or_else(|| {
+            env::var("NVM_SYMLINK")
+                .ok()
+                .map(PathBuf::from)
+                .and_then(|dir| node_in_dir(&dir))
+        })
+}
+
+fn pnpm_cjs_for_cmd(pnpm: &Path) -> Option<PathBuf> {
+    #[cfg(windows)]
+    {
+        let is_cmd = pnpm
+            .extension()
+            .and_then(|ext| ext.to_str())
+            .is_some_and(|ext| ext.eq_ignore_ascii_case("cmd"));
+        if !is_cmd {
+            return None;
+        }
+
+        let cjs = pnpm
+            .parent()?
+            .join("node_modules")
+            .join("pnpm")
+            .join("bin")
+            .join("pnpm.cjs");
+        cjs.is_file().then_some(cjs)
+    }
+    #[cfg(not(windows))]
+    {
+        let _ = pnpm;
+        None
+    }
+}
+
 /// Strip ANSI escape codes (color codes) from a string
 fn strip_ansi_codes(s: &str) -> Cow<'_, str> {
     static ANSI_RE: LazyLock<Regex> = LazyLock::new(|| {
@@ -275,49 +390,31 @@ impl Publisher for TypeScriptPublisher {
         let db = sanitize_db_name(module_name);
 
         // Install dependencies (--ignore-workspace to avoid parent workspace interference).
-        // If NODEJS_DIR is set (e.g. nvm4w on Windows), use full path to pnpm so spawn finds it.
-        let pnpm_exe = env::var("NODEJS_DIR")
-            .ok()
-            .map(|s| s.trim().trim_matches('"').trim().to_string())
-            .filter(|s| !s.is_empty())
-            .map(PathBuf::from)
-            .and_then(|dir| {
-                #[cfg(windows)]
-                {
-                    let pnpm_cmd = dir.join("pnpm.cmd");
-                    let pnpm_exe_path = dir.join("pnpm.exe");
-                    if pnpm_cmd.is_file() {
-                        eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.cmd)", dir.display());
-                        Some(pnpm_cmd)
-                    } else if pnpm_exe_path.is_file() {
-                        eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.exe)", dir.display());
-                        Some(pnpm_exe_path)
-                    } else {
-                        eprintln!(
-                            "[pnpm] NODEJS_DIR set to {} but pnpm.cmd/pnpm.exe not found there, using PATH",
-                            dir.display()
-                        );
-                        None
-                    }
-                }
-                #[cfg(not(windows))]
-                {
-                    let pnpm = dir.join("pnpm");
-                    if pnpm.is_file() {
-                        eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm)", dir.display());
-                        Some(pnpm)
-                    } else {
-                        eprintln!(
-                            "[pnpm] NODEJS_DIR set to {} but pnpm not found there, using PATH",
-                            dir.display()
-                        );
-                        None
-                    }
-                }
-            });
-        let mut pnpm_cmd = match &pnpm_exe {
-            Some(p) => Command::new(p),
-            None => Command::new("pnpm"),
+        let nodejs_dir = configured_nodejs_dir();
+        let pnpm_exe = nodejs_dir
+            .as_deref()
+            .and_then(pnpm_in_dir)
+            .or_else(|| resolve_command_on_path("pnpm"));
+        if let Some(ref pnpm) = pnpm_exe {
+            eprintln!("[pnpm] using {}", pnpm.display());
+        } else if let Some(ref dir) = nodejs_dir {
+            eprintln!(
+                "[pnpm] NODEJS_DIR set to {} but pnpm not found there or on PATH",
+                dir.display()
+            );
+        }
+        let node_exe = resolve_node_exe(nodejs_dir.as_deref());
+        let pnpm_cjs = pnpm_exe.as_deref().and_then(pnpm_cjs_for_cmd);
+        let mut pnpm_cmd = if let (Some(node), Some(cjs)) = (&node_exe, pnpm_cjs) {
+            eprintln!("[pnpm] invoking {} {}", node.display(), cjs.display());
+            let mut cmd = Command::new(node);
+            cmd.arg(cjs);
+            cmd
+        } else {
+            match &pnpm_exe {
+                Some(p) => Command::new(p),
+                None => Command::new("pnpm"),
+            }
         };
         pnpm_cmd
             .arg("install")
@@ -327,30 +424,62 @@ impl Publisher for TypeScriptPublisher {
             // This install runs in a materialized project with workspace config
             // ignored, so pass the repo's pnpm package-age policy explicitly.
             .env("npm_config_minimum_release_age", pnpm_minimum_release_age()?);
-        // When using NODEJS_DIR, prepend it to PATH so pnpm.cmd can find node.
-        if let Some(ref dir) = pnpm_exe
-            && let Some(parent) = dir.parent()
+        let mut prepend_paths = Vec::new();
+        if let Some(dir) = nodejs_dir {
+            prepend_paths.push(dir);
+        }
+        if let Some(ref pnpm) = pnpm_exe
+            && let Some(parent) = pnpm.parent()
         {
-            let mut paths: Vec<PathBuf> = env::split_paths(&env::var("PATH").unwrap_or_default()).collect();
-            paths.insert(0, parent.to_path_buf());
-            if let Ok(new_path) = env::join_paths(paths) {
-                pnpm_cmd.env("PATH", new_path);
+            prepend_paths.push(parent.to_path_buf());
+        }
+        if let Some(node) = node_exe
+            && let Some(parent) = node.parent()
+        {
+            prepend_paths.push(parent.to_path_buf());
+        }
+        let child_path = if !prepend_paths.is_empty() {
+            let mut paths = path_entries();
+            for path in prepend_paths.into_iter().rev() {
+                if !paths.iter().any(|existing| existing == &path) {
+                    paths.insert(0, path);
+                }
+            }
+            env::join_paths(paths).ok()
+        } else {
+            None
+        };
+        if let Some(ref new_path) = child_path {
+            #[cfg(windows)]
+            {
+                pnpm_cmd.env_remove("PATH");
+                pnpm_cmd.env("Path", new_path);
             }
+            #[cfg(not(windows))]
+            pnpm_cmd.env("PATH", new_path);
         }
         run(&mut pnpm_cmd, "pnpm install (typescript)")?;
 
         // Publish (spacetime CLI handles TypeScript compilation internally)
-        run(
-            Command::new("spacetime")
-                .arg("publish")
-                .arg("-c")
-                .arg("-y")
-                .arg("--server")
-                .arg(host_url)
-                .arg(&db)
-                .current_dir(source),
-            "spacetime publish (typescript)",
-        )?;
+        let mut publish_cmd = Command::new("spacetime");
+        publish_cmd
+            .arg("publish")
+            .arg("-c")
+            .arg("-y")
+            .arg("--server")
+            .arg(host_url)
+            .arg(&db)
+            .current_dir(source);
+        if let Some(ref new_path) = child_path {
+            #[cfg(windows)]
+            {
+                publish_cmd.env_remove("PATH");
+                publish_cmd.env("Path", new_path);
+            }
+            #[cfg(not(windows))]
+            publish_cmd.env("PATH", new_path);
+        }
+        run(&mut publish_cmd, "spacetime publish (typescript)")?;
 
         Ok(())
     }

From d5957f26603d5c45eb874b991883b86af2d31d91 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 13:12:46 -0400
Subject: [PATCH 07/25] golden fixes

---
 .../queries/t_037_multi_column_filter/answers/typescript.ts   | 2 +-
 .../benchmarks/schema/t_018_constraints/answers/typescript.ts | 2 +-
 .../schema/t_019_many_to_many/answers/typescript.ts           | 4 ++--
 .../schema/t_021_multi_column_index/answers/typescript.ts     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts
index 26c7dc9b230..1ba8ca175d1 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts
@@ -2,7 +2,7 @@ import { schema, table, t } from 'spacetimedb/server';
 
 const eventLog = table({
   name: 'event_log',
-  indexes: [{ name: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }],
+  indexes: [{ accessor: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }],
 }, {
   id: t.u64().primaryKey().autoInc(),
   category: t.string(),
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts
index 50d9f9c1dae..d23dead5a96 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts
@@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server';
 
 const account = table({
   name: 'account',
-  indexes: [{ name: 'byName', algorithm: 'btree', columns: ['name'] }],
+  indexes: [{ accessor: 'byName', algorithm: 'btree', columns: ['name'] }],
 }, {
   id: t.u64().primaryKey().autoInc(),
   email: t.string().unique(),
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts
index d7629137dcc..4ab152504d1 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts
@@ -24,8 +24,8 @@ const membership = table(
   {
     name: 'membership',
     indexes: [
-      { name: 'byUser', algorithm: 'btree', columns: ['userId'] },
-      { name: 'byGroup', algorithm: 'btree', columns: ['groupId'] },
+      { accessor: 'byUser', algorithm: 'btree', columns: ['userId'] },
+      { accessor: 'byGroup', algorithm: 'btree', columns: ['groupId'] },
     ],
   },
   {
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts
index 5d5fb568d7b..2f237fb0151 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts
@@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server';
 
 const log = table({
   name: 'log',
-  indexes: [{ name: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }],
+  indexes: [{ accessor: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }],
 }, {
   id: t.u64().primaryKey().autoInc(),
   userId: t.i32(),

From 4c679e2ea9827e4fd3c702e7d7bd7c64b0111981 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 14:58:54 -0400
Subject: [PATCH 08/25] fixes: add model_set input; explicit models skip remote routes

---
 .github/workflows/llm-benchmark-periodic.yml  | 55 +++++++++++++++++--
 .../src/bin/llm_benchmark.rs                  | 12 ++--
 .../src/llm/clients/openrouter.rs             |  2 +-
 3 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index cf4b57976f8..6673b541cde 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -6,10 +6,19 @@ on:
     - cron: '0 0 * * 1'
   workflow_dispatch:
     inputs:
+      model_set:
+        description: 'Model set to run'
+        required: false
+        type: choice
+        options:
+          - website_active
+          - local_defaults
+          - explicit
+        default: website_active
       models:
-        description: 'Models to run ("all", or space-separated provider:model groups; each group may contain comma-separated models)'
+        description: 'Space-separated provider:model groups. Required when model_set=explicit.'
         required: false
-        default: 'all'
+        default: ''
       languages:
         description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
         required: false
@@ -100,19 +109,47 @@ jobs:
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
           LLM_BENCH_CSHARP_CONCURRENCY: "1"
           INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
-          INPUT_MODELS: ${{ inputs.models || 'all' }}
+          INPUT_MODEL_SET: ${{ inputs.model_set || 'website_active' }}
+          INPUT_MODELS: ${{ inputs.models || '' }}
           INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
           INPUT_CATEGORIES: ${{ inputs.categories || '' }}
           INPUT_TASKS: ${{ inputs.tasks || '' }}
           INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }}
         run: |
           LANGS="$INPUT_LANGUAGES"
+          MODEL_SET="$INPUT_MODEL_SET"
           MODELS="$INPUT_MODELS"
           MODES="$INPUT_MODES"
           CATEGORIES="$INPUT_CATEGORIES"
           TASKS="$INPUT_TASKS"
           DRY_RUN="$INPUT_DRY_RUN"
 
+          case "$MODEL_SET" in
+            website_active)
+              if [ -n "$MODELS" ]; then
+                echo "::error::models is only valid when model_set=explicit"
+                exit 1
+              fi
+              ;;
+            local_defaults)
+              if [ -n "$MODELS" ]; then
+                echo "::error::models is only valid when model_set=explicit"
+                exit 1
+              fi
+              ;;
+            explicit)
+              if [ -z "$MODELS" ]; then
+                echo "::error::models is required when model_set=explicit"
+                exit 1
+              fi
+              read -r -a MODEL_ARGS <<< "$MODELS"
+              ;;
+            *)
+              echo "::error::unknown model_set '$MODEL_SET' (expected website_active, local_defaults, or explicit)"
+              exit 1
+              ;;
+          esac
+
           SUCCEEDED=0
           FAILED=0
           for LANG in $(echo "$LANGS" | tr ',' ' '); do
@@ -127,16 +164,22 @@ jobs:
               EXTRA_ARGS+=(--dry-run)
             fi
 
-            if [ "$MODELS" = "all" ]; then
+            if [ "$MODEL_SET" = "website_active" ]; then
               if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG"
                 FAILED=$((FAILED + 1))
               fi
+            elif [ "$MODEL_SET" = "local_defaults" ]; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" "${EXTRA_ARGS[@]}"; then
+                SUCCEEDED=$((SUCCEEDED + 1))
+              else
+                echo "::warning::Benchmark run failed for lang=$LANG"
+                FAILED=$((FAILED + 1))
+              fi
             else
-              read -r -a MODEL_ARGS <<< "$MODELS"
-              if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS"
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index 72b551a9540..179be601634 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -559,7 +559,11 @@ fn short_hash(s: &str) -> &str {
 }
 
 fn should_fetch_remote_routes(args: &RunArgs) -> bool {
-    args.model_source == ModelSource::Remote && args.route_overrides.is_none() && !args.hash_only && !args.goldens_only
+    args.model_source == ModelSource::Remote
+        && args.models.is_none()
+        && args.route_overrides.is_none()
+        && !args.hash_only
+        && !args.goldens_only
 }
 
 fn preflight_llm_routes(
@@ -968,7 +972,7 @@ mod tests {
     }
 
     #[test]
-    fn remote_model_source_fetches_for_all_model_selection_paths() {
+    fn explicit_models_bypass_remote_model_source() {
         let mut args = base_run_args();
         args.model_source = ModelSource::Remote;
         assert!(should_fetch_remote_routes(&args));
@@ -977,10 +981,10 @@ mod tests {
             vendor: Vendor::OpenAi,
             models: vec!["gpt-test".to_string()],
         }]);
-        assert!(should_fetch_remote_routes(&args));
+        assert!(!should_fetch_remote_routes(&args));
 
         args.dry_run = true;
-        assert!(should_fetch_remote_routes(&args));
+        assert!(!should_fetch_remote_routes(&args));
     }
 
     #[test]
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
index 61d6998728c..8e8642ada0b 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
@@ -141,7 +141,7 @@ impl OpenRouterClient {
                 content: "ping",
             }],
             temperature: 0.0,
-            max_tokens: 1,
+            max_tokens: 16,
         };
         let auth = HttpClient::bearer(&self.api_key);
         let body = self

From 4358ed59a83df45326c2593e8ae0bc0c235b7f0e Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:22:57 -0400
Subject: [PATCH 09/25] Update publishers.rs: isolated CLI root dir, dotnet publish for C#

---
 .../src/bench/publishers.rs                   | 134 ++++++++++++++----
 1 file changed, 109 insertions(+), 25 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 7ba383dbc06..f82d617ae5f 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -6,7 +6,11 @@ use std::env;
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::Command;
-use std::sync::LazyLock;
+use std::sync::{
+    atomic::{AtomicU64, Ordering},
+    LazyLock,
+};
+use std::time::{SystemTime, UNIX_EPOCH};
 
 fn workspace_root() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR"))
@@ -120,6 +124,48 @@ fn resolve_node_exe(nodejs_dir: Option<&Path>) -> Option<PathBuf> {
         })
 }
 
+struct CliRootDir {
+    path: PathBuf,
+}
+
+impl CliRootDir {
+    fn path(&self) -> &Path {
+        &self.path
+    }
+}
+
+impl Drop for CliRootDir {
+    fn drop(&mut self) {
+        let _ = fs::remove_dir_all(&self.path);
+    }
+}
+
+fn isolated_cli_root() -> Result<CliRootDir> {
+    static COUNTER: AtomicU64 = AtomicU64::new(0);
+
+    for _ in 0..16 {
+        let nanos = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map(|duration| duration.as_nanos())
+            .unwrap_or(0);
+        let id = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let path = env::temp_dir().join(format!("stdb-llm-cli-{}-{nanos}-{id}", std::process::id()));
+        match fs::create_dir(&path) {
+            Ok(()) => return Ok(CliRootDir { path }),
+            Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => continue,
+            Err(error) => return Err(error.into()),
+        }
+    }
+
+    bail!("failed to create isolated SpacetimeDB CLI root directory");
+}
+
+fn spacetime_cmd(cli_root: &CliRootDir) -> Command {
+    let mut cmd = Command::new("spacetime");
+    cmd.arg("--root-dir").arg(cli_root.path());
+    cmd
+}
+
 fn pnpm_cjs_for_cmd(pnpm: &Path) -> Option<PathBuf> {
     #[cfg(windows)]
     {
@@ -279,6 +325,36 @@ impl DotnetPublisher {
         }
         Ok(())
     }
+
+    fn configure_dotnet_env(cmd: &mut Command) -> &mut Command {
+        cmd.env("DOTNET_CLI_TELEMETRY_OPTOUT", "1")
+            .env("DOTNET_NOLOGO", "1")
+            // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
+            // when running multiple dotnet builds in parallel.
+            .env("MSBUILDDISABLENODEREUSE", "1")
+            .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
+    }
+
+    fn built_wasm(root: &Path, config_name: &str) -> Result<PathBuf> {
+        let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|value| value == "1") {
+            "publish"
+        } else {
+            "AppBundle"
+        };
+        let candidates = [
+            root.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
+            root.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
+        ];
+
+        let mut found = candidates.iter().filter(|path| path.exists());
+        let Some(path) = found.next() else {
+            bail!("dotnet publish succeeded but StdbModule.wasm was not found in bin or bin~");
+        };
+        if found.next().is_some() {
+            bail!("dotnet publish produced both bin and bin~ outputs; cannot choose the C# wasm");
+        }
+        Ok(path.to_path_buf())
+    }
 }
 
 impl Publisher for DotnetPublisher {
@@ -288,30 +364,36 @@ impl Publisher for DotnetPublisher {
         }
         println!("publish csharp module {}", module_name);
 
-        Self::ensure_csproj(source)?;
+        let source = fs::canonicalize(source)?;
+        Self::ensure_csproj(&source)?;
 
         let db = sanitize_db_name(module_name);
+        let cli_root = isolated_cli_root()?;
 
-        let mut cmd = Command::new("spacetime");
-        cmd.arg("build")
-            .current_dir(source)
-            .env("DOTNET_CLI_TELEMETRY_OPTOUT", "1")
-            .env("DOTNET_NOLOGO", "1")
-            // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
-            // when running multiple dotnet builds in parallel.
-            .env("MSBUILDDISABLENODEREUSE", "1")
-            .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0");
-        run(&mut cmd, "spacetime build (csharp)")?;
-
-        let mut pubcmd = Command::new("spacetime");
+        let config_name = "Release";
+        let mut build_cmd = Command::new("dotnet");
+        build_cmd
+            .arg("publish")
+            .arg("-c")
+            .arg(config_name)
+            .arg("-v")
+            .arg("quiet")
+            .current_dir(&source);
+        Self::configure_dotnet_env(&mut build_cmd);
+        run(&mut build_cmd, "dotnet publish (csharp)")?;
+        let wasm = Self::built_wasm(&source, config_name)?;
+
+        let mut pubcmd = spacetime_cmd(&cli_root);
         pubcmd
             .arg("publish")
             .arg("-c")
             .arg("-y")
             .arg("--server")
             .arg(host_url)
+            .arg("--bin-path")
+            .arg(wasm)
             .arg(&db)
-            .current_dir(source);
+            .current_dir(&source);
         run(&mut pubcmd, "spacetime publish (csharp)")?;
 
         Ok(())
@@ -345,10 +427,11 @@ impl Publisher for SpacetimeRustPublisher {
 
         // sanitize db + server
         let db = sanitize_db_name(module_name);
+        let cli_root = isolated_cli_root()?;
 
         // 2) Publish
         run(
-            Command::new("spacetime")
+            spacetime_cmd(&cli_root)
                 .arg("publish")
                 .arg("-c")
                 .arg("-y")
@@ -388,6 +471,7 @@ impl Publisher for TypeScriptPublisher {
 
         Self::ensure_package_json(source)?;
         let db = sanitize_db_name(module_name);
+        let cli_root = isolated_cli_root()?;
 
         // Install dependencies (--ignore-workspace to avoid parent workspace interference).
         let nodejs_dir = configured_nodejs_dir();
@@ -428,15 +512,15 @@ impl Publisher for TypeScriptPublisher {
         if let Some(dir) = nodejs_dir {
             prepend_paths.push(dir);
         }
-        if let Some(ref pnpm) = pnpm_exe
-            && let Some(parent) = pnpm.parent()
-        {
-            prepend_paths.push(parent.to_path_buf());
+        if let Some(ref pnpm) = pnpm_exe {
+            if let Some(parent) = pnpm.parent() {
+                prepend_paths.push(parent.to_path_buf());
+            }
         }
-        if let Some(node) = node_exe
-            && let Some(parent) = node.parent()
-        {
-            prepend_paths.push(parent.to_path_buf());
+        if let Some(node) = node_exe {
+            if let Some(parent) = node.parent() {
+                prepend_paths.push(parent.to_path_buf());
+            }
         }
         let child_path = if !prepend_paths.is_empty() {
             let mut paths = path_entries();
@@ -461,7 +545,7 @@ impl Publisher for TypeScriptPublisher {
         run(&mut pnpm_cmd, "pnpm install (typescript)")?;
 
         // Publish (spacetime CLI handles TypeScript compilation internally)
-        let mut publish_cmd = Command::new("spacetime");
+        let mut publish_cmd = spacetime_cmd(&cli_root);
         publish_cmd
             .arg("publish")
             .arg("-c")

From 890be1885a21df20f0e4b6aef7b7a9ffd9c8146b Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:39:42 -0400
Subject: [PATCH 10/25] updates: set DOTNET_* env vars in LLM benchmark workflows

---
 .github/workflows/llm-benchmark-periodic.yml         | 3 +++
 .github/workflows/llm-benchmark-validate-goldens.yml | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 6673b541cde..290cfbee325 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -105,6 +105,9 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
           LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
+          DOTNET_MULTILEVEL_LOOKUP: "0"
+          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
+          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
           LLM_BENCH_CSHARP_CONCURRENCY: "1"
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index cb4c532833e..a5199cb0bfe 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -87,6 +87,9 @@ jobs:
 
       - name: Validate golden answers (${{ matrix.lang }})
         env:
+          DOTNET_MULTILEVEL_LOOKUP: "0"
+          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
+          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
           LLM_BENCH_CSHARP_CONCURRENCY: "1"

From 480cedf0ba60839177cc2d17fd530f9d08b58354 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:44:08 -0400
Subject: [PATCH 11/25] Update publishers.rs: revert C# build to spacetime build

---
 .../src/bench/publishers.rs                   | 45 +++----------------
 1 file changed, 7 insertions(+), 38 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index f82d617ae5f..18d07dfa42b 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -334,27 +334,6 @@ impl DotnetPublisher {
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
     }
-
-    fn built_wasm(root: &Path, config_name: &str) -> Result<PathBuf> {
-        let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|value| value == "1") {
-            "publish"
-        } else {
-            "AppBundle"
-        };
-        let candidates = [
-            root.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
-            root.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
-        ];
-
-        let mut found = candidates.iter().filter(|path| path.exists());
-        let Some(path) = found.next() else {
-            bail!("dotnet publish succeeded but StdbModule.wasm was not found in bin or bin~");
-        };
-        if found.next().is_some() {
-            bail!("dotnet publish produced both bin and bin~ outputs; cannot choose the C# wasm");
-        }
-        Ok(path.to_path_buf())
-    }
 }
 
 impl Publisher for DotnetPublisher {
@@ -364,24 +343,15 @@ impl Publisher for DotnetPublisher {
         }
         println!("publish csharp module {}", module_name);
 
-        let source = fs::canonicalize(source)?;
-        Self::ensure_csproj(&source)?;
+        Self::ensure_csproj(source)?;
 
         let db = sanitize_db_name(module_name);
         let cli_root = isolated_cli_root()?;
 
-        let config_name = "Release";
-        let mut build_cmd = Command::new("dotnet");
-        build_cmd
-            .arg("publish")
-            .arg("-c")
-            .arg(config_name)
-            .arg("-v")
-            .arg("quiet")
-            .current_dir(&source);
-        Self::configure_dotnet_env(&mut build_cmd);
-        run(&mut build_cmd, "dotnet publish (csharp)")?;
-        let wasm = Self::built_wasm(&source, config_name)?;
+        let mut cmd = spacetime_cmd(&cli_root);
+        cmd.arg("build").current_dir(source);
+        Self::configure_dotnet_env(&mut cmd);
+        run(&mut cmd, "spacetime build (csharp)")?;
 
         let mut pubcmd = spacetime_cmd(&cli_root);
         pubcmd
@@ -390,10 +360,9 @@ impl Publisher for DotnetPublisher {
             .arg("-y")
             .arg("--server")
             .arg(host_url)
-            .arg("--bin-path")
-            .arg(wasm)
             .arg(&db)
-            .current_dir(&source);
+            .current_dir(source);
+        Self::configure_dotnet_env(&mut pubcmd);
         run(&mut pubcmd, "spacetime publish (csharp)")?;
 
         Ok(())

From d4999e2e21bd3193ca1d579727d7986e8f50b52c Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 17:12:28 -0400
Subject: [PATCH 12/25] fixes: .NET SDK from global.json; publish C# wasm via --bin-path

---
 .github/workflows/llm-benchmark-periodic.yml  |  2 +-
 .../llm-benchmark-validate-goldens.yml        |  2 +-
 .../src/bench/publishers.rs                   | 48 +++++++++++++++++--
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 290cfbee325..566e7db82d8 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -64,7 +64,7 @@ jobs:
       - name: Setup .NET SDK
         uses: actions/setup-dotnet@v4
         with:
-          dotnet-version: "8.0.x"
+          global-json-file: global.json
 
       - name: Install WASI workload
         env:
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index a5199cb0bfe..fedbb0c406c 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -47,7 +47,7 @@ jobs:
         if: matrix.lang == 'csharp'
         uses: actions/setup-dotnet@v4
         with:
-          dotnet-version: "8.0.x"
+          global-json-file: global.json
 
       - name: Install WASI workload
         if: matrix.lang == 'csharp'
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 18d07dfa42b..0c67a0ffff7 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -1,5 +1,5 @@
 use crate::bench::utils::sanitize_db_name;
-use anyhow::{bail, Result};
+use anyhow::{bail, Context, Result};
 use regex::Regex;
 use std::borrow::Cow;
 use std::env;
@@ -334,6 +334,41 @@ impl DotnetPublisher {
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
     }
+
+    fn built_wasm_path(project_path: &Path) -> Result<PathBuf> {
+        let config_name = "Release";
+        let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") {
+            "publish"
+        } else {
+            "AppBundle"
+        };
+        let output_paths = [
+            project_path.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
+            project_path.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
+        ];
+
+        let mut found = output_paths.iter().filter(|path| path.exists()).collect::<Vec<_>>();
+        if found.len() > 1 {
+            bail!(
+                "C# build produced multiple StdbModule.wasm outputs in {}",
+                project_path.display()
+            );
+        }
+
+        let Some(wasm_path) = found.pop() else {
+            bail!(
+                "C# build finished but no StdbModule.wasm was found under {}",
+                project_path.display()
+            );
+        };
+
+        let optimized_path = wasm_path.with_extension("opt.wasm");
+        if optimized_path.exists() {
+            Ok(optimized_path)
+        } else {
+            Ok(wasm_path.to_path_buf())
+        }
+    }
 }
 
 impl Publisher for DotnetPublisher {
@@ -346,13 +381,18 @@ impl Publisher for DotnetPublisher {
         Self::ensure_csproj(source)?;
 
         let db = sanitize_db_name(module_name);
+        let source = source
+            .canonicalize()
+            .with_context(|| format!("failed to resolve C# source path {}", source.display()))?;
         let cli_root = isolated_cli_root()?;
 
         let mut cmd = spacetime_cmd(&cli_root);
-        cmd.arg("build").current_dir(source);
+        cmd.arg("build").arg("--module-path").arg(&source).current_dir(&source);
         Self::configure_dotnet_env(&mut cmd);
         run(&mut cmd, "spacetime build (csharp)")?;
 
+        let wasm_path = Self::built_wasm_path(&source)?;
+
         let mut pubcmd = spacetime_cmd(&cli_root);
         pubcmd
             .arg("publish")
@@ -360,8 +400,10 @@ impl Publisher for DotnetPublisher {
             .arg("-y")
             .arg("--server")
             .arg(host_url)
+            .arg("--bin-path")
+            .arg(&wasm_path)
             .arg(&db)
-            .current_dir(source);
+            .current_dir(&source);
         Self::configure_dotnet_env(&mut pubcmd);
         run(&mut pubcmd, "spacetime publish (csharp)")?;
 

From e58523f21f4f0828760a774324574e1fbe803fcf Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 17:41:29 -0400
Subject: [PATCH 13/25] Update publishers.rs: build C# with dotnet publish

---
 tools/xtask-llm-benchmark/src/bench/publishers.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 0c67a0ffff7..f905a789601 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -386,10 +386,15 @@ impl Publisher for DotnetPublisher {
             .with_context(|| format!("failed to resolve C# source path {}", source.display()))?;
         let cli_root = isolated_cli_root()?;
 
-        let mut cmd = spacetime_cmd(&cli_root);
-        cmd.arg("build").arg("--module-path").arg(&source).current_dir(&source);
+        let mut cmd = Command::new("dotnet");
+        cmd.arg("publish")
+            .arg("-c")
+            .arg("Release")
+            .arg("-v")
+            .arg("quiet")
+            .current_dir(&source);
         Self::configure_dotnet_env(&mut cmd);
-        run(&mut cmd, "spacetime build (csharp)")?;
+        run(&mut cmd, "dotnet publish (csharp)")?;
 
         let wasm_path = Self::built_wasm_path(&source)?;
 

From 032afd1797195db76ec03dc990081f7dcafd336e Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:08:37 -0400
Subject: [PATCH 14/25] fixes: pack C# runtime packages; reference runtime by package version

---
 .github/workflows/llm-benchmark-periodic.yml  |  6 ++
 .../llm-benchmark-validate-goldens.yml        |  6 ++
 .../src/bench/templates.rs                    | 93 +++++++++++++++++--
 .../templates/csharp/server/StdbModule.csproj | 10 +-
 4 files changed, 100 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 566e7db82d8..bc456520724 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -74,6 +74,12 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
+      - name: Pack C# runtime packages
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }}
+        run: |
+          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
+          dotnet pack -c Release crates/bindings-csharp/Runtime
+
       - name: Set up Node.js
         uses: actions/setup-node@v4
         with:
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index fedbb0c406c..a2d2ef87a3e 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -58,6 +58,12 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
+      - name: Pack C# runtime packages
+        if: matrix.lang == 'csharp'
+        run: |
+          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
+          dotnet pack -c Release crates/bindings-csharp/Runtime
+
       - name: Set up Node.js
         if: matrix.lang == 'typescript'
         uses: actions/setup-node@v4
diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs
index b5fa5f6add3..8ebafc2aefe 100644
--- a/tools/xtask-llm-benchmark/src/bench/templates.rs
+++ b/tools/xtask-llm-benchmark/src/bench/templates.rs
@@ -159,20 +159,99 @@ fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> {
     }
     fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))?;
 
-    let base_rel = relative_to_workspace(root, "crates/bindings-csharp")?;
     let runtime_csproj = workspace_root().join("crates/bindings-csharp/Runtime/Runtime.csproj");
     if !runtime_csproj.is_file() {
         bail!("local C# Runtime not found at {}", runtime_csproj.display());
     }
-    let runtime_ref = format!("{}/Runtime/Runtime.csproj", base_rel);
-    let runtime_dir = format!("{}/Runtime", base_rel);
-    let codegen_ref = format!("{}/Codegen/Codegen.csproj", base_rel);
+    let runtime_version = read_csharp_package_version(&runtime_csproj)?;
     let csproj_path = root.join("StdbModule.csproj");
     let mut csproj = fs::read_to_string(&csproj_path).with_context(|| format!("read {}", csproj_path.display()))?;
-    csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_DIR}", &runtime_dir);
-    csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_REF}", &runtime_ref);
-    csproj = csproj.replace("{SPACETIME_CSHARP_CODEGEN_REF}", &codegen_ref);
+    csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_VERSION}", &runtime_version);
     fs::write(&csproj_path, csproj).with_context(|| format!("write {}", csproj_path.display()))?;
+
+    write_csharp_nuget_config(root)?;
+    Ok(())
+}
+
+fn read_csharp_package_version(csproj_path: &Path) -> Result<String> {
+    let contents = fs::read_to_string(csproj_path).with_context(|| format!("read {}", csproj_path.display()))?;
+    let version = contents
+        .split("<Version>")
+        .nth(1)
+        .and_then(|rest| rest.split("</Version>").next())
+        .map(str::trim)
+        .filter(|version| !version.is_empty())
+        .with_context(|| format!("missing <Version> in {}", csproj_path.display()))?;
+    Ok(version.to_owned())
+}
+
+fn normalize_nuget_path(path: &Path) -> String {
+    path.display()
+        .to_string()
+        .replace('\\', "/")
+        .trim_end_matches('/')
+        .to_string()
+}
+
+fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> {
+    let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| {
+        entry
+            .file_name()
+            .to_str()
+            .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg"))
+    });
+    if !has_package {
+        bail!(
+            "local C# package {} not found in {}. Run: dotnet pack -c Release crates/bindings-csharp/{}",
+            package_id,
+            path.display(),
+            package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id)
+        );
+    }
+    Ok(())
+}
+
+fn write_csharp_nuget_config(root: &Path) -> Result<()> {
+    let workspace = workspace_root();
+    let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release");
+    let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release");
+
+    ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?;
+    ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?;
+
+    let package_cache = root.join(".nuget/packages");
+    let nuget_config = format!(
+        r#"<?xml version="1.0" encoding="utf-8"?>
+<configuration>
+  <config>
+    <add key="globalPackagesFolder" value="{}" />
+  </config>
+  <packageSources>
+    <clear />
+    <add key="spacetimedb-runtime" value="{}" />
+    <add key="spacetimedb-bsatn-runtime" value="{}" />
+    <add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
+  </packageSources>
+  <packageSourceMapping>
+    <packageSource key="spacetimedb-runtime">
+      <package pattern="SpacetimeDB.Runtime" />
+    </packageSource>
+    <packageSource key="spacetimedb-bsatn-runtime">
+      <package pattern="SpacetimeDB.BSATN.Runtime" />
+    </packageSource>
+    <packageSource key="nuget.org">
+      <package pattern="*" />
+    </packageSource>
+  </packageSourceMapping>
+</configuration>
+"#,
+        normalize_nuget_path(&package_cache),
+        normalize_nuget_path(&runtime_source),
+        normalize_nuget_path(&bsatn_source),
+    );
+
+    fs::write(root.join("nuget.config"), nuget_config)
+        .with_context(|| format!("write {}", root.join("nuget.config").display()))?;
     Ok(())
 }
 
diff --git a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj
index ce04141c7a0..f286932badd 100644
--- a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj
+++ b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj
@@ -1,9 +1,5 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
-  <!-- Import Runtime build props/targets when using ProjectReference (NuGet auto-imports these; ProjectReference does not) -->
-  <Import Project="{SPACETIME_CSHARP_RUNTIME_DIR}/build/SpacetimeDB.Runtime.props" />
-  <Import Project="{SPACETIME_CSHARP_RUNTIME_DIR}/build/SpacetimeDB.Runtime.targets" />
-
   <PropertyGroup>
     <TargetFramework>net8.0</TargetFramework>
     <RuntimeIdentifier>wasi-wasm</RuntimeIdentifier>
@@ -12,9 +8,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <ProjectReference Include="{SPACETIME_CSHARP_RUNTIME_REF}" />
-    <!-- Codegen is packed into the NuGet Runtime package; with ProjectReference we must add it explicitly -->
-    <ProjectReference Include="{SPACETIME_CSHARP_CODEGEN_REF}" OutputItemType="Analyzer" ReferenceOutputAssembly="false" />
+    <PackageReference Include="SpacetimeDB.Runtime" Version="{SPACETIME_CSHARP_RUNTIME_VERSION}" />
   </ItemGroup>
 
-</Project>
\ No newline at end of file
+</Project>

From 603741817b123aa41a54f90d4fac96f432e833c2 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:39:05 -0400
Subject: [PATCH 15/25] llm-benchmark: match smoketest C# publish flow

---
 .github/workflows/llm-benchmark-periodic.yml  |   6 -
 .../llm-benchmark-validate-goldens.yml        |   6 -
 .../src/bench/publishers.rs                   |  70 ++---------
 .../src/bench/templates.rs                    | 111 +++++++++++++-----
 tools/xtask-llm-benchmark/src/bench/utils.rs  |   6 +-
 5 files changed, 97 insertions(+), 102 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index bc456520724..566e7db82d8 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -74,12 +74,6 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
-      - name: Pack C# runtime packages
-        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }}
-        run: |
-          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
-          dotnet pack -c Release crates/bindings-csharp/Runtime
-
       - name: Set up Node.js
         uses: actions/setup-node@v4
         with:
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index a2d2ef87a3e..fedbb0c406c 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -58,12 +58,6 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
-      - name: Pack C# runtime packages
-        if: matrix.lang == 'csharp'
-        run: |
-          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
-          dotnet pack -c Release crates/bindings-csharp/Runtime
-
       - name: Set up Node.js
         if: matrix.lang == 'typescript'
         uses: actions/setup-node@v4
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index f905a789601..6109b872314 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -211,14 +211,14 @@ pub trait Publisher: Send + Sync {
 
 /// Check if the process was killed by a signal (e.g., SIGSEGV = 11)
 #[cfg(unix)]
-fn was_signal_killed(status: &std::process::ExitStatus) -> bool {
+fn signal_killed_by(status: &std::process::ExitStatus) -> Option<i32> {
     use std::os::unix::process::ExitStatusExt;
-    status.signal().is_some()
+    status.signal()
 }
 
 #[cfg(not(unix))]
-fn was_signal_killed(_status: &std::process::ExitStatus) -> bool {
-    false
+fn signal_killed_by(_status: &std::process::ExitStatus) -> Option<i32> {
+    None
 }
 
 /// Check if the failure is a transient error that should be retried.
@@ -282,13 +282,14 @@ fn run_with_retry(cmd: &mut Command, label: &str, max_retries: u32) -> Result<()
         let stderr = strip_ansi_codes(&stderr_raw);
         let stdout = strip_ansi_codes(&stdout_raw);
 
-        // Retry on signal kills (like SIGSEGV) or transient build errors
-        let should_retry = was_signal_killed(&out.status) || is_transient_build_error(&stderr, &stdout);
+        // Retry on signal kills (like SIGSEGV) or transient build errors.
+        let signal = signal_killed_by(&out.status);
+        let should_retry = signal.is_some() || is_transient_build_error(&stderr, &stdout);
         if should_retry && attempt < max_retries {
-            let reason = if was_signal_killed(&out.status) {
-                "signal kill"
+            let reason = if let Some(signal) = signal {
+                format!("signal {signal}")
             } else {
-                "transient build error"
+                "transient build error".to_string()
             };
             eprintln!("⚠️ {label}: {reason} detected, will retry...");
             last_error = Some(format!(
@@ -334,41 +335,6 @@ impl DotnetPublisher {
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
     }
-
-    fn built_wasm_path(project_path: &Path) -> Result<PathBuf> {
-        let config_name = "Release";
-        let subdir = if env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") {
-            "publish"
-        } else {
-            "AppBundle"
-        };
-        let output_paths = [
-            project_path.join(format!("bin/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
-            project_path.join(format!("bin~/{config_name}/net8.0/wasi-wasm/{subdir}/StdbModule.wasm")),
-        ];
-
-        let mut found = output_paths.iter().filter(|path| path.exists()).collect::<Vec<_>>();
-        if found.len() > 1 {
-            bail!(
-                "C# build produced multiple StdbModule.wasm outputs in {}",
-                project_path.display()
-            );
-        }
-
-        let Some(wasm_path) = found.pop() else {
-            bail!(
-                "C# build finished but no StdbModule.wasm was found under {}",
-                project_path.display()
-            );
-        };
-
-        let optimized_path = wasm_path.with_extension("opt.wasm");
-        if optimized_path.exists() {
-            Ok(optimized_path)
-        } else {
-            Ok(wasm_path.to_path_buf())
-        }
-    }
 }
 
 impl Publisher for DotnetPublisher {
@@ -386,18 +352,6 @@ impl Publisher for DotnetPublisher {
             .with_context(|| format!("failed to resolve C# source path {}", source.display()))?;
         let cli_root = isolated_cli_root()?;
 
-        let mut cmd = Command::new("dotnet");
-        cmd.arg("publish")
-            .arg("-c")
-            .arg("Release")
-            .arg("-v")
-            .arg("quiet")
-            .current_dir(&source);
-        Self::configure_dotnet_env(&mut cmd);
-        run(&mut cmd, "dotnet publish (csharp)")?;
-
-        let wasm_path = Self::built_wasm_path(&source)?;
-
         let mut pubcmd = spacetime_cmd(&cli_root);
         pubcmd
             .arg("publish")
@@ -405,8 +359,8 @@ impl Publisher for DotnetPublisher {
             .arg("-y")
             .arg("--server")
             .arg(host_url)
-            .arg("--bin-path")
-            .arg(&wasm_path)
+            .arg("--module-path")
+            .arg(&source)
             .arg(&db)
             .current_dir(&source);
         Self::configure_dotnet_env(&mut pubcmd);
diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs
index 8ebafc2aefe..e6d8938481f 100644
--- a/tools/xtask-llm-benchmark/src/bench/templates.rs
+++ b/tools/xtask-llm-benchmark/src/bench/templates.rs
@@ -3,8 +3,17 @@ use anyhow::{bail, Context, Result};
 use std::{
     env, fs, io,
     path::{Path, PathBuf},
+    process::Command,
+    sync::OnceLock,
 };
 
+const CSHARP_PACKAGE_PROJECTS: [(&str, &str); 2] = [
+    ("BSATN.Runtime", "SpacetimeDB.BSATN.Runtime"),
+    ("Runtime", "SpacetimeDB.Runtime"),
+];
+
+static CSHARP_LOCAL_FEED: OnceLock<Result<PathBuf, String>> = OnceLock::new();
+
 pub fn materialize_project(
     lang: &str,
     category: &str,
@@ -193,33 +202,82 @@ fn normalize_nuget_path(path: &Path) -> String {
         .to_string()
 }
 
-fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> {
-    let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| {
-        entry
-            .file_name()
-            .to_str()
-            .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg"))
-    });
-    if !has_package {
-        bail!(
-            "local C# package {} not found in {}. Run: dotnet pack -c Release crates/bindings-csharp/{}",
-            package_id,
-            path.display(),
-            package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id)
-        );
+fn run_dotnet(mut cmd: Command, label: &str) -> Result<()> {
+    let debug = format!("{cmd:?}");
+    let output = cmd
+        .output()
+        .with_context(|| format!("failed to run {label}: {debug}"))?;
+    if output.status.success() {
+        return Ok(());
     }
-    Ok(())
+    bail!(
+        "{label} failed: {debug}\n--- stderr ---\n{}\n--- stdout ---\n{}",
+        String::from_utf8_lossy(&output.stderr),
+        String::from_utf8_lossy(&output.stdout)
+    );
 }
 
-fn write_csharp_nuget_config(root: &Path) -> Result<()> {
+fn csharp_local_feed() -> Result<PathBuf> {
+    match CSHARP_LOCAL_FEED.get_or_init(|| build_csharp_local_feed().map_err(|err| format!("{err:#}"))) {
+        Ok(path) => Ok(path.clone()),
+        Err(err) => bail!("{err}"),
+    }
+}
+
+fn build_csharp_local_feed() -> Result<PathBuf> {
     let workspace = workspace_root();
-    let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release");
-    let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release");
+    let bindings = workspace.join("crates/bindings-csharp");
+    let local_feed = workspace.join("target/llm-benchmark-csharp/local-feed");
 
-    ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?;
-    ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?;
+    if local_feed.exists() {
+        fs::remove_dir_all(&local_feed).with_context(|| format!("remove {}", local_feed.display()))?;
+    }
+    fs::create_dir_all(&local_feed).with_context(|| format!("create {}", local_feed.display()))?;
+
+    for (project_dir, _) in CSHARP_PACKAGE_PROJECTS {
+        let mut cmd = Command::new("dotnet");
+        cmd.arg("pack")
+            .arg("-c")
+            .arg("Release")
+            .arg("-o")
+            .arg(&local_feed)
+            .current_dir(bindings.join(project_dir));
+        run_dotnet(cmd, &format!("dotnet pack {project_dir}"))?;
+    }
+
+    let feed_files = fs::read_dir(&local_feed)
+        .with_context(|| format!("inspect {}", local_feed.display()))?
+        .flatten()
+        .filter_map(|entry| entry.file_name().into_string().ok())
+        .collect::<Vec<_>>();
+
+    for (_, package_id) in CSHARP_PACKAGE_PROJECTS {
+        let package_prefix = format!("{package_id}.");
+        if !feed_files
+            .iter()
+            .any(|name| name.starts_with(&package_prefix) && name.ends_with(".nupkg"))
+        {
+            bail!(
+                "local C# feed at {} is missing package {}. Found files: {:?}",
+                local_feed.display(),
+                package_id,
+                feed_files
+            );
+        }
+    }
+
+    Ok(local_feed)
+}
+
+fn write_csharp_nuget_config(root: &Path) -> Result<()> {
+    let local_feed = csharp_local_feed()?;
 
     let package_cache = root.join(".nuget/packages");
+    if package_cache.exists() {
+        fs::remove_dir_all(&package_cache).with_context(|| format!("remove {}", package_cache.display()))?;
+    }
+    fs::create_dir_all(&package_cache).with_context(|| format!("create {}", package_cache.display()))?;
+
     let nuget_config = format!(
         r#"<?xml version="1.0" encoding="utf-8"?>
 <configuration>
@@ -228,16 +286,12 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> {
   </config>
   <packageSources>
     <clear />
-    <add key="spacetimedb-runtime" value="{}" />
-    <add key="spacetimedb-bsatn-runtime" value="{}" />
+    <add key="spacetimedb-local" value="{}" />
     <add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
   </packageSources>
   <packageSourceMapping>
-    <packageSource key="spacetimedb-runtime">
-      <package pattern="SpacetimeDB.Runtime" />
-    </packageSource>
-    <packageSource key="spacetimedb-bsatn-runtime">
-      <package pattern="SpacetimeDB.BSATN.Runtime" />
+    <packageSource key="spacetimedb-local">
+      <package pattern="SpacetimeDB.*" />
     </packageSource>
     <packageSource key="nuget.org">
       <package pattern="*" />
@@ -246,8 +300,7 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> {
 </configuration>
 "#,
         normalize_nuget_path(&package_cache),
-        normalize_nuget_path(&runtime_source),
-        normalize_nuget_path(&bsatn_source),
+        normalize_nuget_path(&local_feed),
     );
 
     fs::write(root.join("nuget.config"), nuget_config)
diff --git a/tools/xtask-llm-benchmark/src/bench/utils.rs b/tools/xtask-llm-benchmark/src/bench/utils.rs
index a8ccddc23e5..6e28315e4f6 100644
--- a/tools/xtask-llm-benchmark/src/bench/utils.rs
+++ b/tools/xtask-llm-benchmark/src/bench/utils.rs
@@ -109,13 +109,13 @@ pub fn bench_rust_concurrency() -> usize {
         .unwrap_or(2)
 }
 
-/// Concurrency for C# builds. Lower default than Rust due to dotnet/WASI SDK
-/// instability under high parallelism (causes SIGSEGV and "Pipe is broken" errors).
+/// Concurrency for C# builds. Keep this serialized to match smoketest behavior;
+/// dotnet/WASI SDK builds are fragile when multiple generated modules publish at once.
 pub fn bench_csharp_concurrency() -> usize {
     env::var("LLM_BENCH_CSHARP_CONCURRENCY")
         .ok()
         .and_then(|s| s.parse().ok())
-        .unwrap_or(4)
+        .unwrap_or(1)
 }
 
 pub fn bench_route_concurrency() -> usize {

From 2e6e02fe97bdfd7b23cb68d84fe007fc39189c37 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:55:18 -0400
Subject: [PATCH 16/25] llm-benchmark: restore CI pack steps and per-project NuGet sources

---
 .github/workflows/llm-benchmark-periodic.yml  |   6 +
 .../llm-benchmark-validate-goldens.yml        |   6 +
 .../src/bench/templates.rs                    | 106 +++++-------------
 3 files changed, 41 insertions(+), 77 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 566e7db82d8..bc456520724 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -74,6 +74,12 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
+      - name: Pack C# runtime packages
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }}
+        run: |
+          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
+          dotnet pack -c Release crates/bindings-csharp/Runtime
+
       - name: Set up Node.js
         uses: actions/setup-node@v4
         with:
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index fedbb0c406c..a2d2ef87a3e 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -58,6 +58,12 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
+      - name: Pack C# runtime packages
+        if: matrix.lang == 'csharp'
+        run: |
+          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
+          dotnet pack -c Release crates/bindings-csharp/Runtime
+
       - name: Set up Node.js
         if: matrix.lang == 'typescript'
         uses: actions/setup-node@v4
diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs
index e6d8938481f..35176de8200 100644
--- a/tools/xtask-llm-benchmark/src/bench/templates.rs
+++ b/tools/xtask-llm-benchmark/src/bench/templates.rs
@@ -3,17 +3,8 @@ use anyhow::{bail, Context, Result};
 use std::{
     env, fs, io,
     path::{Path, PathBuf},
-    process::Command,
-    sync::OnceLock,
 };
 
-const CSHARP_PACKAGE_PROJECTS: [(&str, &str); 2] = [
-    ("BSATN.Runtime", "SpacetimeDB.BSATN.Runtime"),
-    ("Runtime", "SpacetimeDB.Runtime"),
-];
-
-static CSHARP_LOCAL_FEED: OnceLock<Result<PathBuf, String>> = OnceLock::new();
-
 pub fn materialize_project(
     lang: &str,
     category: &str,
@@ -202,75 +193,31 @@ fn normalize_nuget_path(path: &Path) -> String {
         .to_string()
 }
 
-fn run_dotnet(mut cmd: Command, label: &str) -> Result<()> {
-    let debug = format!("{cmd:?}");
-    let output = cmd
-        .output()
-        .with_context(|| format!("failed to run {label}: {debug}"))?;
-    if output.status.success() {
-        return Ok(());
-    }
-    bail!(
-        "{label} failed: {debug}\n--- stderr ---\n{}\n--- stdout ---\n{}",
-        String::from_utf8_lossy(&output.stderr),
-        String::from_utf8_lossy(&output.stdout)
-    );
-}
-
-fn csharp_local_feed() -> Result<PathBuf> {
-    match CSHARP_LOCAL_FEED.get_or_init(|| build_csharp_local_feed().map_err(|err| format!("{err:#}"))) {
-        Ok(path) => Ok(path.clone()),
-        Err(err) => bail!("{err}"),
+fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> {
+    let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| {
+        entry
+            .file_name()
+            .to_str()
+            .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg"))
+    });
+    if !has_package {
+        bail!(
+            "local C# package {} not found in {}. Run: dotnet pack -c Release crates/bindings-csharp/{}",
+            package_id,
+            path.display(),
+            package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id)
+        );
     }
+    Ok(())
 }
 
-fn build_csharp_local_feed() -> Result<PathBuf> {
+fn write_csharp_nuget_config(root: &Path) -> Result<()> {
     let workspace = workspace_root();
-    let bindings = workspace.join("crates/bindings-csharp");
-    let local_feed = workspace.join("target/llm-benchmark-csharp/local-feed");
+    let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release");
+    let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release");
 
-    if local_feed.exists() {
-        fs::remove_dir_all(&local_feed).with_context(|| format!("remove {}", local_feed.display()))?;
-    }
-    fs::create_dir_all(&local_feed).with_context(|| format!("create {}", local_feed.display()))?;
-
-    for (project_dir, _) in CSHARP_PACKAGE_PROJECTS {
-        let mut cmd = Command::new("dotnet");
-        cmd.arg("pack")
-            .arg("-c")
-            .arg("Release")
-            .arg("-o")
-            .arg(&local_feed)
-            .current_dir(bindings.join(project_dir));
-        run_dotnet(cmd, &format!("dotnet pack {project_dir}"))?;
-    }
-
-    let feed_files = fs::read_dir(&local_feed)
-        .with_context(|| format!("inspect {}", local_feed.display()))?
-        .flatten()
-        .filter_map(|entry| entry.file_name().into_string().ok())
-        .collect::<Vec<_>>();
-
-    for (_, package_id) in CSHARP_PACKAGE_PROJECTS {
-        let package_prefix = format!("{package_id}.");
-        if !feed_files
-            .iter()
-            .any(|name| name.starts_with(&package_prefix) && name.ends_with(".nupkg"))
-        {
-            bail!(
-                "local C# feed at {} is missing package {}. Found files: {:?}",
-                local_feed.display(),
-                package_id,
-                feed_files
-            );
-        }
-    }
-
-    Ok(local_feed)
-}
-
-fn write_csharp_nuget_config(root: &Path) -> Result<()> {
-    let local_feed = csharp_local_feed()?;
+    ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?;
+    ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?;
 
     let package_cache = root.join(".nuget/packages");
     if package_cache.exists() {
@@ -286,12 +233,16 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> {
   </config>
   <packageSources>
     <clear />
-    <add key="spacetimedb-local" value="{}" />
+    <add key="spacetimedb-runtime" value="{}" />
+    <add key="spacetimedb-bsatn-runtime" value="{}" />
     <add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
   </packageSources>
   <packageSourceMapping>
-    <packageSource key="spacetimedb-local">
-      <package pattern="SpacetimeDB.*" />
+    <packageSource key="spacetimedb-runtime">
+      <package pattern="SpacetimeDB.Runtime" />
+    </packageSource>
+    <packageSource key="spacetimedb-bsatn-runtime">
+      <package pattern="SpacetimeDB.BSATN.Runtime" />
     </packageSource>
     <packageSource key="nuget.org">
       <package pattern="*" />
@@ -300,7 +251,8 @@ fn write_csharp_nuget_config(root: &Path) -> Result<()> {
 </configuration>
 "#,
         normalize_nuget_path(&package_cache),
-        normalize_nuget_path(&local_feed),
+        normalize_nuget_path(&runtime_source),
+        normalize_nuget_path(&bsatn_source),
     );
 
     fs::write(root.join("nuget.config"), nuget_config)

From b2308b1a6c906d0155359d83ff32bb61ba339fa4 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:09:02 -0400
Subject: [PATCH 17/25] cli: allow skipping the C# wasi-experimental workload check

---
 crates/cli/src/tasks/csharp.rs                | 87 ++++++++++---------
 .../src/bench/publishers.rs                   |  3 +
 2 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs
index 5df8b730448..7f76b3199d2 100644
--- a/crates/cli/src/tasks/csharp.rs
+++ b/crates/cli/src/tasks/csharp.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use itertools::Itertools;
+use std::env;
 use std::ffi::OsString;
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -8,6 +9,12 @@ fn parse_major_version(version: &str) -> Option<u8> {
     version.split('.').next()?.parse::<u8>().ok()
 }
 
+fn skip_workload_check() -> bool {
+    env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK")
+        .ok()
+        .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES"))
+}
+
 pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result<PathBuf> {
     // All `dotnet` commands must execute in the project directory, otherwise
     // global.json won't have any effect and wrong .NET SDK might be picked.
@@ -17,46 +24,48 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re
         };
     }
 
-    // Check if the `wasi-experimental` workload is installed. Unfortunately, we
-    // have to do this by inspecting the human-readable output. There is a
-    // hidden `--machine-readable` flag but it also mixes in human-readable
-    // output as well as unnecessarily updates various unrelated manifests.
-    match dotnet!("workload", "list").read() {
-        Ok(workloads) if workloads.contains("wasi-experimental") => {}
-        Ok(_) => {
-            // If wasi-experimental is not found, first check if we're running
-            // on .NET SDK 8.0. We can't even install that workload on older
-            // versions, and we don't support .NET 9.0 yet, so this helps to
-            // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
-            let version = dotnet!("--version").read().unwrap_or_default();
-            if parse_major_version(&version) != Some(8) {
-                anyhow::bail!(concat!(
-                    ".NET SDK 8.0 is required, but found {version}.\n",
-                    "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
-                ));
-            }
+    if !skip_workload_check() {
+        // Check if the `wasi-experimental` workload is installed. Unfortunately, we
+        // have to do this by inspecting the human-readable output. There is a
+        // hidden `--machine-readable` flag but it also mixes in human-readable
+        // output as well as unnecessarily updates various unrelated manifests.
+        match dotnet!("workload", "list").read() {
+            Ok(workloads) if workloads.contains("wasi-experimental") => {}
+            Ok(_) => {
+                // If wasi-experimental is not found, first check if we're running
+                // on .NET SDK 8.0. We can't even install that workload on older
+                // versions, and we don't support .NET 9.0 yet, so this helps to
+                // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
+                let version = dotnet!("--version").read().unwrap_or_default();
+                if parse_major_version(&version) != Some(8) {
+                    anyhow::bail!(concat!(
+                        ".NET SDK 8.0 is required, but found {version}.\n",
+                        "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
+                    ));
+                }
 
-            // Finally, try to install the workload ourselves. On some systems
-            // this might require elevated privileges, so print a nice error
-            // message if it fails.
-            dotnet!(
-                "workload",
-                "install",
-                "wasi-experimental",
-                "--skip-manifest-update"
-            )
-            .stderr_capture()
-            .run()
-            .context(concat!(
-                "Couldn't install the required wasi-experimental workload.\n",
-                "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
-            ))?;
-        }
-        Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
-            anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
-        }
-        Err(error) => anyhow::bail!("{error}"),
-    };
+                // Finally, try to install the workload ourselves. On some systems
+                // this might require elevated privileges, so print a nice error
+                // message if it fails.
+                dotnet!(
+                    "workload",
+                    "install",
+                    "wasi-experimental",
+                    "--skip-manifest-update"
+                )
+                .stderr_capture()
+                .run()
+                .context(concat!(
+                    "Couldn't install the required wasi-experimental workload.\n",
+                    "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
+                ))?;
+            }
+            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
+                anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
+            }
+            Err(error) => anyhow::bail!("{error}"),
+        };
+    }
 
     let config_name = if build_debug { "Debug" } else { "Release" };
 
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 6109b872314..11fc75bbac5 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -334,6 +334,9 @@ impl DotnetPublisher {
             // when running multiple dotnet builds in parallel.
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
+            // The workflow installs the WASI workload before running benchmarks.
+            // Avoid `dotnet workload list`, which can segfault on the CI runner.
+            .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1")
     }
 }
 

From 2b133b8bc9630ecf7268c3ff8c950878cfc0246a Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:19:06 -0400
Subject: [PATCH 18/25] Add opt-in stable dotnet publish mode for benchmark C# builds

---
 crates/cli/src/tasks/csharp.rs                | 29 ++++++++++++++++---
 .../src/bench/publishers.rs                   |  2 ++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs
index 7f76b3199d2..f9337cc42aa 100644
--- a/crates/cli/src/tasks/csharp.rs
+++ b/crates/cli/src/tasks/csharp.rs
@@ -9,12 +9,20 @@ fn parse_major_version(version: &str) -> Option<u8> {
     version.split('.').next()?.parse::<u8>().ok()
 }
 
-fn skip_workload_check() -> bool {
-    env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK")
+fn env_flag(name: &str) -> bool {
+    env::var(name)
         .ok()
         .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES"))
 }
 
+fn skip_workload_check() -> bool {
+    env_flag("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK")
+}
+
+fn stable_dotnet_publish() -> bool {
+    env_flag("SPACETIMEDB_CSHARP_STABLE_PUBLISH")
+}
+
 pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result<PathBuf> {
     // All `dotnet` commands must execute in the project directory, otherwise
     // global.json won't have any effect and wrong .NET SDK might be picked.
@@ -77,8 +85,21 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re
         )
     })?;
 
-    // run dotnet publish using cmd macro
-    dotnet!("publish", "-c", config_name, "-v", "quiet").run()?;
+    let mut publish_args = vec!["publish", "-c", config_name, "-v"];
+    if stable_dotnet_publish() {
+        publish_args.extend([
+            "minimal",
+            "--disable-build-servers",
+            "-m:1",
+            "-p:BuildInParallel=false",
+            "-p:RestoreDisableParallel=true",
+            "-p:UseSharedCompilation=false",
+        ]);
+    } else {
+        publish_args.push("quiet");
+    }
+
+    duct::cmd("dotnet", publish_args).dir(project_path).run()?;
 
     // check if file exists
     let subdir = if std::env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") {
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 11fc75bbac5..700cb955ea0 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -337,6 +337,8 @@ impl DotnetPublisher {
             // The workflow installs the WASI workload before running benchmarks.
             // Avoid `dotnet workload list`, which can segfault on the CI runner.
             .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1")
+            // Keep benchmark C# publishes on the conservative MSBuild path.
+            .env("SPACETIMEDB_CSHARP_STABLE_PUBLISH", "1")
     }
 }
 

From 7857671200ad19e0f926925a5c7bd4e213c657e5 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:51:54 -0400
Subject: [PATCH 19/25] Revert C# publish env toggles; run LLM preflight after goldens build

---
 crates/cli/src/tasks/csharp.rs                | 112 +++++++-----------
 .../src/bench/publishers.rs                   |   5 -
 .../src/bin/llm_benchmark.rs                  |  37 ++----
 3 files changed, 53 insertions(+), 101 deletions(-)

diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs
index f9337cc42aa..5df8b730448 100644
--- a/crates/cli/src/tasks/csharp.rs
+++ b/crates/cli/src/tasks/csharp.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use itertools::Itertools;
-use std::env;
 use std::ffi::OsString;
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -9,20 +8,6 @@ fn parse_major_version(version: &str) -> Option<u8> {
     version.split('.').next()?.parse::<u8>().ok()
 }
 
-fn env_flag(name: &str) -> bool {
-    env::var(name)
-        .ok()
-        .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES"))
-}
-
-fn skip_workload_check() -> bool {
-    env_flag("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK")
-}
-
-fn stable_dotnet_publish() -> bool {
-    env_flag("SPACETIMEDB_CSHARP_STABLE_PUBLISH")
-}
-
 pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result<PathBuf> {
     // All `dotnet` commands must execute in the project directory, otherwise
     // global.json won't have any effect and wrong .NET SDK might be picked.
@@ -32,48 +17,46 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re
         };
     }
 
-    if !skip_workload_check() {
-        // Check if the `wasi-experimental` workload is installed. Unfortunately, we
-        // have to do this by inspecting the human-readable output. There is a
-        // hidden `--machine-readable` flag but it also mixes in human-readable
-        // output as well as unnecessarily updates various unrelated manifests.
-        match dotnet!("workload", "list").read() {
-            Ok(workloads) if workloads.contains("wasi-experimental") => {}
-            Ok(_) => {
-                // If wasi-experimental is not found, first check if we're running
-                // on .NET SDK 8.0. We can't even install that workload on older
-                // versions, and we don't support .NET 9.0 yet, so this helps to
-                // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
-                let version = dotnet!("--version").read().unwrap_or_default();
-                if parse_major_version(&version) != Some(8) {
-                    anyhow::bail!(concat!(
-                        ".NET SDK 8.0 is required, but found {version}.\n",
-                        "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
-                    ));
-                }
-
-                // Finally, try to install the workload ourselves. On some systems
-                // this might require elevated privileges, so print a nice error
-                // message if it fails.
-                dotnet!(
-                    "workload",
-                    "install",
-                    "wasi-experimental",
-                    "--skip-manifest-update"
-                )
-                .stderr_capture()
-                .run()
-                .context(concat!(
-                    "Couldn't install the required wasi-experimental workload.\n",
-                    "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
-                ))?;
+    // Check if the `wasi-experimental` workload is installed. Unfortunately, we
+    // have to do this by inspecting the human-readable output. There is a
+    // hidden `--machine-readable` flag but it also mixes in human-readable
+    // output as well as unnecessarily updates various unrelated manifests.
+    match dotnet!("workload", "list").read() {
+        Ok(workloads) if workloads.contains("wasi-experimental") => {}
+        Ok(_) => {
+            // If wasi-experimental is not found, first check if we're running
+            // on .NET SDK 8.0. We can't even install that workload on older
+            // versions, and we don't support .NET 9.0 yet, so this helps to
+            // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
+            let version = dotnet!("--version").read().unwrap_or_default();
+            if parse_major_version(&version) != Some(8) {
+                anyhow::bail!(concat!(
+                    ".NET SDK 8.0 is required, but found {version}.\n",
+                    "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
+                ));
             }
-            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
-                anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
-            }
-            Err(error) => anyhow::bail!("{error}"),
-        };
-    }
+
+            // Finally, try to install the workload ourselves. On some systems
+            // this might require elevated privileges, so print a nice error
+            // message if it fails.
+            dotnet!(
+                "workload",
+                "install",
+                "wasi-experimental",
+                "--skip-manifest-update"
+            )
+            .stderr_capture()
+            .run()
+            .context(concat!(
+                "Couldn't install the required wasi-experimental workload.\n",
+                "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
+            ))?;
+        }
+        Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
+            anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
+        }
+        Err(error) => anyhow::bail!("{error}"),
+    };
 
     let config_name = if build_debug { "Debug" } else { "Release" };
 
@@ -85,21 +68,8 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re
         )
     })?;
 
-    let mut publish_args = vec!["publish", "-c", config_name, "-v"];
-    if stable_dotnet_publish() {
-        publish_args.extend([
-            "minimal",
-            "--disable-build-servers",
-            "-m:1",
-            "-p:BuildInParallel=false",
-            "-p:RestoreDisableParallel=true",
-            "-p:UseSharedCompilation=false",
-        ]);
-    } else {
-        publish_args.push("quiet");
-    }
-
-    duct::cmd("dotnet", publish_args).dir(project_path).run()?;
+    // run dotnet publish using cmd macro
+    dotnet!("publish", "-c", config_name, "-v", "quiet").run()?;
 
     // check if file exists
     let subdir = if std::env::var_os("EXPERIMENTAL_WASM_AOT").is_some_and(|v| v == "1") {
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 700cb955ea0..6109b872314 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -334,11 +334,6 @@ impl DotnetPublisher {
             // when running multiple dotnet builds in parallel.
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
-            // The workflow installs the WASI workload before running benchmarks.
-            // Avoid `dotnet workload list`, which can segfault on the CI runner.
-            .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1")
-            // Keep benchmark C# publishes on the conservative MSBuild path.
-            .env("SPACETIMEDB_CSHARP_STABLE_PUBLISH", "1")
     }
 }
 
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index 179be601634..6ec030a49e8 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -279,11 +279,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         eprintln!("[warn] failed to upload task catalog: {e}");
     }
 
-    let RuntimeInit {
-        runtime,
-        provider: llm_provider,
-        guard,
-    } = initialize_runtime_and_provider(config.hash_only, config.goldens_only)?;
+    let RuntimeInit { runtime, guard } = initialize_runtime(config.hash_only)?;
 
     config.host = guard.as_ref().map(|g| g.host_url.clone());
 
@@ -309,12 +305,7 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         return Ok(());
     }
 
-    if !config.goldens_only && !config.hash_only {
-        let rt = runtime.as_ref().expect("failed to initialize runtime for preflight");
-        let provider = llm_provider.as_ref().expect("llm provider required for preflight");
-        let routes = filter_routes(&config);
-        preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?;
-
+    let llm_provider = if !config.goldens_only && !config.hash_only {
         let rt = runtime.as_ref().expect("failed to initialize runtime for goldens");
         rt.block_on(ensure_goldens_built_once(
             config.host.clone(),
@@ -322,7 +313,15 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
             config.lang,
             selectors_ref,
         ))?;
-    }
+
+        let provider = make_provider_from_env()?;
+        let rt = runtime.as_ref().expect("failed to initialize runtime for preflight");
+        let routes = filter_routes(&config);
+        preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?;
+        Some(provider)
+    } else {
+        None
+    };
 
     let mut all_outcomes: Vec<RunOutcome> = Vec::new();
 
@@ -799,15 +798,13 @@ fn categories_to_set(v: Option<Vec<String>>) -> Option<HashSet<String>> {
 
 pub struct RuntimeInit {
     pub runtime: Option<Runtime>,
-    pub provider: Option<Arc<dyn LlmProvider>>,
     pub guard: Option<SpacetimeDbGuard>,
 }
 
-fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Result<RuntimeInit> {
+fn initialize_runtime(hash_only: bool) -> Result<RuntimeInit> {
     if hash_only {
         return Ok(RuntimeInit {
             runtime: None,
-            provider: None,
             guard: None,
         });
     }
@@ -817,18 +814,8 @@ fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Resul
 
     let runtime = tokio::runtime::Builder::new_multi_thread().enable_all().build()?;
 
-    if goldens_only {
-        return Ok(RuntimeInit {
-            runtime: Some(runtime),
-            provider: None,
-            guard: Some(spacetime),
-        });
-    }
-
-    let llm_provider = make_provider_from_env()?;
     Ok(RuntimeInit {
         runtime: Some(runtime),
-        provider: Some(llm_provider),
         guard: Some(spacetime),
     })
 }

From ee38f7a9d3d8a352afb749b15ccd749ec58d12f0 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:02:23 -0400
Subject: [PATCH 20/25] Re-add SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK opt-out

---
 crates/cli/src/tasks/csharp.rs                | 87 ++++++++++---------
 .../src/bench/publishers.rs                   |  3 +
 2 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs
index 5df8b730448..7f76b3199d2 100644
--- a/crates/cli/src/tasks/csharp.rs
+++ b/crates/cli/src/tasks/csharp.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use itertools::Itertools;
+use std::env;
 use std::ffi::OsString;
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -8,6 +9,12 @@ fn parse_major_version(version: &str) -> Option<u8> {
     version.split('.').next()?.parse::<u8>().ok()
 }
 
+fn skip_workload_check() -> bool {
+    env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK")
+        .ok()
+        .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES"))
+}
+
 pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result<PathBuf> {
     // All `dotnet` commands must execute in the project directory, otherwise
     // global.json won't have any effect and wrong .NET SDK might be picked.
@@ -17,46 +24,48 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re
         };
     }
 
-    // Check if the `wasi-experimental` workload is installed. Unfortunately, we
-    // have to do this by inspecting the human-readable output. There is a
-    // hidden `--machine-readable` flag but it also mixes in human-readable
-    // output as well as unnecessarily updates various unrelated manifests.
-    match dotnet!("workload", "list").read() {
-        Ok(workloads) if workloads.contains("wasi-experimental") => {}
-        Ok(_) => {
-            // If wasi-experimental is not found, first check if we're running
-            // on .NET SDK 8.0. We can't even install that workload on older
-            // versions, and we don't support .NET 9.0 yet, so this helps to
-            // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
-            let version = dotnet!("--version").read().unwrap_or_default();
-            if parse_major_version(&version) != Some(8) {
-                anyhow::bail!(concat!(
-                    ".NET SDK 8.0 is required, but found {version}.\n",
-                    "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
-                ));
-            }
+    if !skip_workload_check() {
+        // Check if the `wasi-experimental` workload is installed. Unfortunately, we
+        // have to do this by inspecting the human-readable output. There is a
+        // hidden `--machine-readable` flag but it also mixes in human-readable
+        // output as well as unnecessarily updates various unrelated manifests.
+        match dotnet!("workload", "list").read() {
+            Ok(workloads) if workloads.contains("wasi-experimental") => {}
+            Ok(_) => {
+                // If wasi-experimental is not found, first check if we're running
+                // on .NET SDK 8.0. We can't even install that workload on older
+                // versions, and we don't support .NET 9.0 yet, so this helps to
+                // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
+                let version = dotnet!("--version").read().unwrap_or_default();
+                if parse_major_version(&version) != Some(8) {
+                    anyhow::bail!(concat!(
+                        ".NET SDK 8.0 is required, but found {version}.\n",
+                        "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
+                    ));
+                }
 
-            // Finally, try to install the workload ourselves. On some systems
-            // this might require elevated privileges, so print a nice error
-            // message if it fails.
-            dotnet!(
-                "workload",
-                "install",
-                "wasi-experimental",
-                "--skip-manifest-update"
-            )
-            .stderr_capture()
-            .run()
-            .context(concat!(
-                "Couldn't install the required wasi-experimental workload.\n",
-                "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
-            ))?;
-        }
-        Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
-            anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
-        }
-        Err(error) => anyhow::bail!("{error}"),
-    };
+                // Finally, try to install the workload ourselves. On some systems
+                // this might require elevated privileges, so print a nice error
+                // message if it fails.
+                dotnet!(
+                    "workload",
+                    "install",
+                    "wasi-experimental",
+                    "--skip-manifest-update"
+                )
+                .stderr_capture()
+                .run()
+                .context(concat!(
+                    "Couldn't install the required wasi-experimental workload.\n",
+                    "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
+                ))?;
+            }
+            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
+                anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
+            }
+            Err(error) => anyhow::bail!("{error}"),
+        };
+    }
 
     let config_name = if build_debug { "Debug" } else { "Release" };
 
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 6109b872314..11fc75bbac5 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -334,6 +334,9 @@ impl DotnetPublisher {
             // when running multiple dotnet builds in parallel.
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
+            // The workflow installs the WASI workload before running benchmarks.
+            // Avoid `dotnet workload list`, which can segfault on the CI runner.
+            .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1")
     }
 }
 

From 77e2924cb3bc84b0bab69982e69f4e2f8eb300a9 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:13:27 -0400
Subject: [PATCH 21/25] Update llm-benchmark-periodic.yml

---
 .github/workflows/llm-benchmark-periodic.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index bc456520724..da314290e71 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -81,16 +81,19 @@ jobs:
           dotnet pack -c Release crates/bindings-csharp/Runtime
 
       - name: Set up Node.js
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
         uses: actions/setup-node@v4
         with:
           node-version: 22
 
       - name: Install pnpm
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
         uses: ./.github/actions/setup-pnpm
         with:
           run_install: true
 
       - name: Build TypeScript SDK
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
         run: pnpm build
         working-directory: crates/bindings-typescript
 

From 63a9c34025db60a214a7c973d8e8bfa1d03ea8d2 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:15:19 -0400
Subject: [PATCH 22/25] Revert SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK opt-out

---
 crates/cli/src/tasks/csharp.rs                | 87 +++++++++----------
 .../src/bench/publishers.rs                   |  3 -
 2 files changed, 39 insertions(+), 51 deletions(-)

diff --git a/crates/cli/src/tasks/csharp.rs b/crates/cli/src/tasks/csharp.rs
index 7f76b3199d2..5df8b730448 100644
--- a/crates/cli/src/tasks/csharp.rs
+++ b/crates/cli/src/tasks/csharp.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use itertools::Itertools;
-use std::env;
 use std::ffi::OsString;
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -9,12 +8,6 @@ fn parse_major_version(version: &str) -> Option<u8> {
     version.split('.').next()?.parse::<u8>().ok()
 }
 
-fn skip_workload_check() -> bool {
-    env::var("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK")
-        .ok()
-        .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES"))
-}
-
 pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Result<PathBuf> {
     // All `dotnet` commands must execute in the project directory, otherwise
     // global.json won't have any effect and wrong .NET SDK might be picked.
@@ -24,48 +17,46 @@ pub(crate) fn build_csharp(project_path: &Path, build_debug: bool) -> anyhow::Re
         };
     }
 
-    if !skip_workload_check() {
-        // Check if the `wasi-experimental` workload is installed. Unfortunately, we
-        // have to do this by inspecting the human-readable output. There is a
-        // hidden `--machine-readable` flag but it also mixes in human-readable
-        // output as well as unnecessarily updates various unrelated manifests.
-        match dotnet!("workload", "list").read() {
-            Ok(workloads) if workloads.contains("wasi-experimental") => {}
-            Ok(_) => {
-                // If wasi-experimental is not found, first check if we're running
-                // on .NET SDK 8.0. We can't even install that workload on older
-                // versions, and we don't support .NET 9.0 yet, so this helps to
-                // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
-                let version = dotnet!("--version").read().unwrap_or_default();
-                if parse_major_version(&version) != Some(8) {
-                    anyhow::bail!(concat!(
-                        ".NET SDK 8.0 is required, but found {version}.\n",
-                        "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
-                    ));
-                }
-
-                // Finally, try to install the workload ourselves. On some systems
-                // this might require elevated privileges, so print a nice error
-                // message if it fails.
-                dotnet!(
-                    "workload",
-                    "install",
-                    "wasi-experimental",
-                    "--skip-manifest-update"
-                )
-                .stderr_capture()
-                .run()
-                .context(concat!(
-                    "Couldn't install the required wasi-experimental workload.\n",
-                    "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
-                ))?;
+    // Check if the `wasi-experimental` workload is installed. Unfortunately, we
+    // have to do this by inspecting the human-readable output. There is a
+    // hidden `--machine-readable` flag but it also mixes in human-readable
+    // output as well as unnecessarily updates various unrelated manifests.
+    match dotnet!("workload", "list").read() {
+        Ok(workloads) if workloads.contains("wasi-experimental") => {}
+        Ok(_) => {
+            // If wasi-experimental is not found, first check if we're running
+            // on .NET SDK 8.0. We can't even install that workload on older
+            // versions, and we don't support .NET 9.0 yet, so this helps to
+            // provide a nicer message than "Workload ID wasi-experimental is not recognized.".
+            let version = dotnet!("--version").read().unwrap_or_default();
+            if parse_major_version(&version) != Some(8) {
+                anyhow::bail!(concat!(
+                    ".NET SDK 8.0 is required, but found {version}.\n",
+                    "If you have multiple versions of .NET SDK installed, configure your project using https://learn.microsoft.com/en-us/dotnet/core/tools/global-json."
+                ));
             }
-            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
-                anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
-            }
-            Err(error) => anyhow::bail!("{error}"),
-        };
-    }
+
+            // Finally, try to install the workload ourselves. On some systems
+            // this might require elevated privileges, so print a nice error
+            // message if it fails.
+            dotnet!(
+                "workload",
+                "install",
+                "wasi-experimental",
+                "--skip-manifest-update"
+            )
+            .stderr_capture()
+            .run()
+            .context(concat!(
+                "Couldn't install the required wasi-experimental workload.\n",
+                "You might need to install it manually by running `dotnet workload install wasi-experimental` with privileged rights."
+            ))?;
+        }
+        Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
+            anyhow::bail!("dotnet not found in PATH. Please install .NET SDK 8.0.")
+        }
+        Err(error) => anyhow::bail!("{error}"),
+    };
 
     let config_name = if build_debug { "Debug" } else { "Release" };
 
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 11fc75bbac5..6109b872314 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -334,9 +334,6 @@ impl DotnetPublisher {
             // when running multiple dotnet builds in parallel.
             .env("MSBUILDDISABLENODEREUSE", "1")
             .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
-            // The workflow installs the WASI workload before running benchmarks.
-            // Avoid `dotnet workload list`, which can segfault on the CI runner.
-            .env("SPACETIMEDB_SKIP_CSHARP_WORKLOAD_CHECK", "1")
     }
 }
 

From 9596077046a37eef609ca2f74f6407bd05b2ef26 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Mon, 15 Jun 2026 09:06:15 -0400
Subject: [PATCH 23/25] preflight no error; vendor to openrouter in periodic

---
 .github/workflows/llm-benchmark-periodic.yml     | 1 +
 tools/xtask-llm-benchmark/src/llm/clients/mod.rs | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index da314290e71..da3af16b609 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -112,6 +112,7 @@ jobs:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          LLM_VENDOR: openrouter
           LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
           LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
           DOTNET_MULTILEVEL_LOOKUP: "0"
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
index 83454c2677c..254fe5b8f63 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
@@ -8,7 +8,7 @@ pub mod openai;
 pub mod openrouter;
 pub mod xai;
 
-use anyhow::{bail, Result};
+use anyhow::Result;
 use async_trait::async_trait;
 
 pub use anthropic::AnthropicClient;
@@ -44,11 +44,11 @@ pub trait LlmClient: Send + Sync {
     fn provider_name(&self) -> &'static str;
 
     async fn preflight(&self, model: &str) -> Result<ClientPreflight> {
-        bail!(
-            "{} credit preflight is not implemented for model '{}'",
+        Ok(ClientPreflight::new(format!(
+            "{} credit preflight not implemented for model '{}'; skipped",
             self.provider_name(),
             model
-        )
+        )))
     }
 
     async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput>;

From 65e4539f04875dc645d89fcb974aa81ff0d379e8 Mon Sep 17 00:00:00 2001
From: bradleyshep <148254416+bradleyshep@users.noreply.github.com>
Date: Mon, 15 Jun 2026 10:11:12 -0400
Subject: [PATCH 24/25] lints

---
 .../xtask-llm-benchmark/src/bench/publishers.rs  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 6109b872314..92622972114 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -482,15 +482,15 @@ impl Publisher for TypeScriptPublisher {
         if let Some(dir) = nodejs_dir {
             prepend_paths.push(dir);
         }
-        if let Some(ref pnpm) = pnpm_exe {
-            if let Some(parent) = pnpm.parent() {
-                prepend_paths.push(parent.to_path_buf());
-            }
+        if let Some(ref pnpm) = pnpm_exe
+            && let Some(parent) = pnpm.parent()
+        {
+            prepend_paths.push(parent.to_path_buf());
         }
-        if let Some(node) = node_exe {
-            if let Some(parent) = node.parent() {
-                prepend_paths.push(parent.to_path_buf());
-            }
+        if let Some(node) = node_exe
+            && let Some(parent) = node.parent()
+        {
+            prepend_paths.push(parent.to_path_buf());
         }
         let child_path = if !prepend_paths.is_empty() {
             let mut paths = path_entries();

From 23ab3f731a8afd61b93dc82180dc9b793cc10740 Mon Sep 17 00:00:00 2001
From: clockwork-labs-bot <clockwork-labs-bot@clockworklabs.io>
Date: Mon, 15 Jun 2026 23:47:53 -0400
Subject: [PATCH 25/25] Avoid .NET globalization crash in LLM benchmarks
 (#5335)

# Description of Changes

Sets `DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1` only on the benchmark
harness command that publishes generated C# modules. This keeps dotnet
startup out of localized DateTime/TimeZoneInfo formatting on the CI
runner, which was crashing before generated C# module publish could run.

Stacked on #5324.

```bash
gh workflow run llm-benchmark-periodic.yml \
  --repo ClockworkLabs/SpacetimeDB \
  --ref bot/debug-llm-csharp-publish \
  -f model_set=explicit \
  -f models="openrouter:openai/gpt-5.4-mini" \
  -f languages=rust,csharp,typescript \
  -f modes=guidelines \
  -f tasks=t_000_empty_reducers \
  -f dry_run=true
```

# API and ABI breaking changes

None.

# Expected complexity level and risk

1. CI benchmark harness environment fix.

# Testing

- [x] `cargo fmt --all`
- [x] `cargo check --manifest-path tools/xtask-llm-benchmark/Cargo.toml`
- [x] `ruby -e 'require "yaml";
YAML.load_file(".github/workflows/llm-benchmark-periodic.yml");
YAML.load_file(".github/workflows/llm-benchmark-validate-goldens.yml")'`
- [x] `git diff --check`

---------

Co-authored-by: clockwork-labs-bot <clockwork-labs-bot@users.noreply.github.com>
---
 tools/xtask-llm-benchmark/src/bench/publishers.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 92622972114..b7fb74c6936 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -330,6 +330,10 @@ impl DotnetPublisher {
     fn configure_dotnet_env(cmd: &mut Command) -> &mut Command {
         cmd.env("DOTNET_CLI_TELEMETRY_OPTOUT", "1")
             .env("DOTNET_NOLOGO", "1")
+            // The CI runner's .NET install can crash while formatting localized
+            // DateTime/TimeZoneInfo data before publish starts. Force invariant
+            // globalization so generated C# module publish reaches MSBuild.
+            .env("DOTNET_SYSTEM_GLOBALIZATION_INVARIANT", "1")
             // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
             // when running multiple dotnet builds in parallel.
             .env("MSBUILDDISABLENODEREUSE", "1")