diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml
index 40ad2c75fe4..da3af16b609 100644
--- a/.github/workflows/llm-benchmark-periodic.yml
+++ b/.github/workflows/llm-benchmark-periodic.yml
@@ -2,15 +2,23 @@ name: Periodic LLM benchmarks
 
 on:
   schedule:
-    # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
-    # or '0 */4 * * *' for every 4h.
-    - cron: '0 0 * * *'
+    # Weekly on Monday at midnight UTC.
+    - cron: '0 0 * * 1'
   workflow_dispatch:
     inputs:
+      model_set:
+        description: 'Model set to run'
+        required: false
+        type: choice
+        options:
+          - website_active
+          - local_defaults
+          - explicit
+        default: website_active
       models:
-        description: 'Models to run (provider:model format, comma-separated, or "all")'
+        description: 'Space-separated provider:model groups. Required when model_set=explicit.'
         required: false
-        default: 'all'
+        default: ''
       languages:
         description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
         required: false
@@ -19,12 +27,24 @@ on:
         description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
         required: false
         default: 'guidelines,no_context'
+      categories:
+        description: 'Optional benchmark categories to run (comma-separated)'
+        required: false
+        default: ''
+      tasks:
+        description: 'Optional benchmark task ids/selectors to run (comma-separated)'
+        required: false
+        default: ''
+      dry_run:
+        description: 'Run benchmarks without uploading results'
+        type: boolean  # checkbox: the run script only honours the exact string "true"
+        default: false
 
 permissions:
   contents: read
 
 concurrency:
-  group: llm-benchmark-periodic
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -33,10 +53,9 @@ jobs:
     timeout-minutes: 180
 
     steps:
-      - name: Checkout master
+      - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: master
           fetch-depth: 1
 
       - uses: dtolnay/rust-toolchain@stable
@@ -45,7 +64,7 @@ jobs:
       - name: Setup .NET SDK
         uses: actions/setup-dotnet@v4
         with:
-          dotnet-version: "8.0.x"
+          global-json-file: global.json
 
       - name: Install WASI workload
         env:
@@ -55,13 +74,28 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
+      - name: Pack C# runtime packages
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }}
+        run: |
+          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
+          dotnet pack -c Release crates/bindings-csharp/Runtime
+
       - name: Set up Node.js
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
         uses: actions/setup-node@v4
         with:
           node-version: 22
 
       - name: Install pnpm
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
         uses: ./.github/actions/setup-pnpm
+        with:
+          run_install: true
+
+      - name: Build TypeScript SDK
+        if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
+        run: pnpm build
+        working-directory: crates/bindings-typescript
 
       - name: Build llm-benchmark tool
         run: cargo install --path tools/xtask-llm-benchmark --locked
@@ -78,30 +112,87 @@ jobs:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          LLM_VENDOR: openrouter
           LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
           LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
+          DOTNET_MULTILEVEL_LOOKUP: "0"
+          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
+          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
+          LLM_BENCH_CSHARP_CONCURRENCY: "1"
           INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
-          INPUT_MODELS: ${{ inputs.models || 'all' }}
+          INPUT_MODEL_SET: ${{ inputs.model_set || 'website_active' }}
+          INPUT_MODELS: ${{ inputs.models || '' }}
           INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
+          INPUT_CATEGORIES: ${{ inputs.categories || '' }}
+          INPUT_TASKS: ${{ inputs.tasks || '' }}
+          INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }}
         run: |
           LANGS="$INPUT_LANGUAGES"
+          MODEL_SET="$INPUT_MODEL_SET"
           MODELS="$INPUT_MODELS"
           MODES="$INPUT_MODES"
+          CATEGORIES="$INPUT_CATEGORIES"
+          TASKS="$INPUT_TASKS"
+          DRY_RUN="$INPUT_DRY_RUN"
+
+          # Fail fast on an inconsistent model_set / models combination:
+          # only model_set=explicit accepts a models list, and it requires one.
+          # MODEL_ARGS is populated for the explicit set only.
+          case "$MODEL_SET" in
+            # Neither of these sets takes a models list.
+            website_active | local_defaults)
+              if [ -n "$MODELS" ]; then
+                echo "::error::models is only valid when model_set=explicit"
+                exit 1
+              fi
+              ;;
+            explicit)
+              if [ -z "$MODELS" ]; then
+                echo "::error::models is required when model_set=explicit"
+                exit 1
+              fi
+              # Split on whitespace into one array element per provider:model group.
+              read -r -a MODEL_ARGS <<< "$MODELS"
+              ;;
+            # Defensive: the choice input should never produce any other value.
+            *)
+              echo "::error::unknown model_set '$MODEL_SET' (expected website_active, local_defaults, or explicit)"
+              exit 1
+              ;;
+          esac
 
           SUCCEEDED=0
           FAILED=0
           for LANG in $(echo "$LANGS" | tr ',' ' '); do
-            if [ "$MODELS" = "all" ]; then
-              if llm_benchmark run --lang "$LANG" --modes "$MODES"; then
+            EXTRA_ARGS=()
+            if [ -n "$CATEGORIES" ]; then
+              EXTRA_ARGS+=(--categories "$CATEGORIES")
+            fi
+            if [ -n "$TASKS" ]; then
+              EXTRA_ARGS+=(--tasks "$TASKS")
+            fi
+            if [ "$DRY_RUN" = "true" ]; then
+              EXTRA_ARGS+=(--dry-run)
+            fi
+
+            if [ "$MODEL_SET" = "website_active" ]; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then
+                SUCCEEDED=$((SUCCEEDED + 1))
+              else
+                echo "::warning::Benchmark run failed for lang=$LANG"
+                FAILED=$((FAILED + 1))
+              fi
+            elif [ "$MODEL_SET" = "local_defaults" ]; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" "${EXTRA_ARGS[@]}"; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG"
                 FAILED=$((FAILED + 1))
               fi
             else
-              if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "$MODELS"; then
+              if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then
                 SUCCEEDED=$((SUCCEEDED + 1))
               else
                 echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS"
@@ -110,7 +201,7 @@ jobs:
             fi
           done
           echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"
-          if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
-            echo "::error::All benchmark runs failed"
+          if [ "$FAILED" -gt 0 ]; then
+            echo "::error::$FAILED benchmark run(s) failed"
             exit 1
           fi
diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml
index 17384a654e3..a2d2ef87a3e 100644
--- a/.github/workflows/llm-benchmark-validate-goldens.yml
+++ b/.github/workflows/llm-benchmark-validate-goldens.yml
@@ -2,15 +2,26 @@ name: Validate LLM benchmark golden answers
 
 on:
   schedule:
-    # Nightly at 2 AM UTC
-    - cron: '0 2 * * *'
-  workflow_dispatch: {}
+    # Weekly on Monday at 2 AM UTC.
+    - cron: '0 2 * * 1'
+  workflow_dispatch:
+    inputs:
+      lang:
+        description: 'Language to validate for manual smoke runs'
+        required: false
+        type: choice
+        default: all
+        options:
+          - all
+          - rust
+          - csharp
+          - typescript
 
 permissions:
   contents: read
 
 concurrency:
-  group: llm-benchmark-validate-goldens
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -21,13 +32,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        lang: [rust, csharp, typescript]
+        lang: ${{ fromJSON(github.event_name == 'workflow_dispatch' && inputs.lang != 'all' && format('["{0}"]', inputs.lang) || '["rust","csharp","typescript"]') }}
 
     steps:
-      - name: Checkout master
+      - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: master
           fetch-depth: 1
 
       - uses: dtolnay/rust-toolchain@stable
@@ -37,7 +47,7 @@ jobs:
         if: matrix.lang == 'csharp'
         uses: actions/setup-dotnet@v4
         with:
-          dotnet-version: "8.0.x"
+          global-json-file: global.json
 
       - name: Install WASI workload
         if: matrix.lang == 'csharp'
@@ -48,6 +58,12 @@ jobs:
         run: |
           dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
 
+      - name: Pack C# runtime packages
+        if: matrix.lang == 'csharp'
+        run: |
+          dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
+          dotnet pack -c Release crates/bindings-csharp/Runtime
+
       - name: Set up Node.js
         if: matrix.lang == 'typescript'
         uses: actions/setup-node@v4
@@ -57,6 +73,13 @@ jobs:
       - name: Install pnpm
         if: matrix.lang == 'typescript'
         uses: ./.github/actions/setup-pnpm
+        with:
+          run_install: true
+
+      - name: Build TypeScript SDK
+        if: matrix.lang == 'typescript'
+        run: pnpm build
+        working-directory: crates/bindings-typescript
 
       - name: Build llm-benchmark tool
         run: cargo install --path tools/xtask-llm-benchmark --locked
@@ -70,7 +93,13 @@ jobs:
 
       - name: Validate golden answers (${{ matrix.lang }})
         env:
+          DOTNET_MULTILEVEL_LOOKUP: "0"
+          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
+          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
           MSBUILDDISABLENODEREUSE: "1"
           DOTNET_CLI_USE_MSBUILD_SERVER: "0"
+          LLM_BENCH_CSHARP_CONCURRENCY: "1"
+          # Hand the matrix value to the script via env instead of expanding it inside the shell text.
+          MATRIX_LANG: ${{ matrix.lang }}
         run: |
-          llm_benchmark run --goldens-only --lang ${{ matrix.lang }}
+          llm_benchmark run --goldens-only --lang "$MATRIX_LANG"
diff --git a/tools/xtask-llm-benchmark/src/api/client.rs b/tools/xtask-llm-benchmark/src/api/client.rs
index edc61756152..0b43ccb5bac 100644
--- a/tools/xtask-llm-benchmark/src/api/client.rs
+++ b/tools/xtask-llm-benchmark/src/api/client.rs
@@ -1,34 +1,118 @@
-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
 use serde_json::json;
 
 use crate::bench::normalize::{canonical_mode, normalize_model_names};
 use crate::bench::types::{Results, RunOutcome};
+use crate::llm::types::Vendor;
+use crate::llm::ModelRoute;
+
+#[derive(Debug)]
+struct RemoteModelRouteRow {
+    display_name: String,
+    vendor: String,
+    api_model: String,
+    openrouter_model: Option<String>,
+    active: Option<bool>,
+    available: Option<bool>,
+}
+
+/// Returns the value of the first key in `keys` that is present in `row` and holds a JSON string.
+fn read_string_field(row: &serde_json::Map<String, serde_json::Value>, keys: &[&str]) -> Option<String> {
+    let text = keys.iter().find_map(|alias| row.get(*alias)?.as_str())?;
+    Some(text.to_owned())
+}
+
+/// Returns the value of the first key in `keys` that is present in `row` and holds a JSON boolean.
+fn read_bool_field(row: &serde_json::Map<String, serde_json::Value>, keys: &[&str]) -> Option<bool> {
+    keys.iter().find_map(|alias| row.get(*alias)?.as_bool())
+}
+
+fn parse_model_route_value(value: serde_json::Value) -> Result<RemoteModelRouteRow> {
+    let row = value
+        .as_object()
+        .ok_or_else(|| anyhow!("remote model row must be an object"))?;
+
+    Ok(RemoteModelRouteRow {
+        display_name: read_string_field(row, &["display_name", "displayName", "name"]).unwrap_or_default(),
+        vendor: read_string_field(row, &["vendor"]).unwrap_or_default(),
+        api_model: read_string_field(row, &["api_model", "apiModel"]).unwrap_or_default(),
+        openrouter_model: read_string_field(row, &["openrouter_model", "openrouterModel"]),
+        active: read_bool_field(row, &["active"]),
+        available: read_bool_field(row, &["available"]),
+    })
+}
+
+/// Converts one registry row into a `ModelRoute`.
+/// Returns `Ok(None)` for rows explicitly flagged `active: false` or `available: false`, and an
+/// error when the vendor is unknown or `display_name` / `api_model` is blank.
+fn parse_model_route_row(row: RemoteModelRouteRow) -> Result<Option<ModelRoute>> {
+    if row.active == Some(false) || row.available == Some(false) {
+        return Ok(None);
+    }
+
+    let vendor = Vendor::parse(&row.vendor).ok_or_else(|| anyhow!("unknown model vendor '{}'", row.vendor))?;
+    let display_name = row.display_name.trim();
+    let api_model = row.api_model.trim();
+
+    if display_name.is_empty() {
+        anyhow::bail!("remote model row is missing display_name");
+    }
+    if api_model.is_empty() {
+        anyhow::bail!("remote model row '{}' is missing api_model", display_name);
+    }
+
+    // Trim before the emptiness check so a padded id is not forwarded with its whitespace.
+    let openrouter_model = row.openrouter_model.as_deref().map(str::trim).filter(|id| !id.is_empty());
+    Ok(Some(ModelRoute::new(display_name, vendor, api_model, openrouter_model)))
+}
+
+/// Parses the model registry payload: the row array is read from the `models` member when
+/// `body` has one, otherwise from `body` itself. Rows that map to no route are dropped,
+/// the first invalid row aborts parsing, and an empty result is an error.
+pub fn parse_model_routes_response(body: &serde_json::Value) -> Result<Vec<ModelRoute>> {
+    let payload = body.get("models").unwrap_or(body);
+    let raw_rows: Vec<serde_json::Value> =
+        serde_json::from_value(payload.clone()).context("parse llm benchmark model rows")?;
+
+    let routes = raw_rows
+        .into_iter()
+        .map(|raw| parse_model_route_value(raw).and_then(parse_model_route_row))
+        .filter_map(|parsed| parsed.transpose())
+        .collect::<Result<Vec<ModelRoute>>>()?;
+
+    if routes.is_empty() {
+        anyhow::bail!("no active available LLM benchmark models returned by website");
+    }
+    Ok(routes)
+}
 
 /// HTTP client for the SpacetimeDB LLM benchmark API (spacetime-web Postgres).
 ///
-/// Supports two POST endpoints that already exist in spacetime-web:
-/// - `POST /api/llm-benchmark-upload` — upload benchmark results
-/// - `POST /api/llm-benchmark-tasks` — upload task catalog
+/// Supports endpoints owned by spacetime-web:
+/// - `POST /api/llm-benchmark-upload` - upload benchmark results
+/// - `POST /api/llm-benchmark-tasks` - upload task catalog
+/// - `GET /api/llm-benchmark-models?active=true` - fetch active benchmark models
 #[derive(Clone)]
 pub struct ApiClient {
-    client: reqwest::blocking::Client,
     base_url: String,
     api_key: String,
 }
 
 impl ApiClient {
     pub fn new(base_url: &str, api_key: &str) -> Result<Self> {
-        let client = reqwest::blocking::Client::builder()
-            .timeout(std::time::Duration::from_secs(120))
-            .build()
-            .context("failed to build HTTP client")?;
         Ok(Self {
-            client,
             base_url: base_url.trim_end_matches('/').to_string(),
             api_key: api_key.to_string(),
         })
     }
 
+    fn client(&self) -> Result<reqwest::blocking::Client> {
+        reqwest::blocking::Client::builder()
+            .timeout(std::time::Duration::from_secs(120))
+            .build()
+            .context("failed to build HTTP client")
+    }
+
     /// Build from environment variables `LLM_BENCHMARK_UPLOAD_URL` and `LLM_BENCHMARK_API_KEY`.
     /// Returns `None` if `LLM_BENCHMARK_UPLOAD_URL` is not set.
     pub fn from_env() -> Result<Option<Self>> {
@@ -71,6 +155,7 @@ impl ApiClient {
         normalize_model_names(&mut results);
 
         let url = format!("{}/api/llm-benchmark-upload", self.base_url);
+        let client = self.client()?;
         let mut total_uploaded = 0usize;
 
         for lang_entry in &results.languages {
@@ -92,8 +177,7 @@ impl ApiClient {
                     "models": models_json,
                 });
 
-                let resp = self
-                    .client
+                let resp = client
                     .post(&url)
                     .header("Authorization", format!("Bearer {}", self.api_key))
                     .header("Content-Type", "application/json")
@@ -113,7 +197,7 @@ impl ApiClient {
                     let status = resp.status();
                     let body = resp.text().unwrap_or_default();
                     anyhow::bail!(
-                        "upload failed for {}/{}: {} — {}",
+                        "upload failed for {}/{}: {} - {}",
                         lang_entry.lang,
                         mode_entry.mode,
                         status,
@@ -126,6 +210,26 @@ impl ApiClient {
         Ok(total_uploaded)
     }
 
+    /// Fetch active/available benchmark models from the website model registry.
+    pub fn fetch_model_routes(&self) -> Result<Vec<ModelRoute>> {
+        let url = format!("{}/api/llm-benchmark-models?active=true", self.base_url);
+        let resp = self
+            .client()?
+            .get(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .send()
+            .context("fetch LLM benchmark models failed")?;
+
+        if resp.status().is_success() {
+            let body: serde_json::Value = resp.json().context("parse model registry response")?;
+            parse_model_routes_response(&body)
+        } else {
+            let status = resp.status();
+            let body = resp.text().unwrap_or_default();
+            anyhow::bail!("fetch LLM benchmark models failed: {} - {}", status, body);
+        }
+    }
+
     /// Upload the task catalog to `POST /api/llm-benchmark-tasks`, derived from
     /// the benchmarks directory structure on disk.
     pub fn upload_task_catalog(&self, bench_root: &std::path::Path) -> Result<usize> {
@@ -207,7 +311,7 @@ impl ApiClient {
         let payload = json!({ "categories": categories });
 
         let resp = self
-            .client
+            .client()?
             .post(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .header("Content-Type", "application/json")
@@ -239,7 +343,7 @@ impl ApiClient {
         let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&"));
 
         let resp = self
-            .client
+            .client()?
             .get(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .send()
@@ -282,7 +386,7 @@ impl ApiClient {
         let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&"));
 
         let resp = self
-            .client
+            .client()?
             .get(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .send()
@@ -316,7 +420,7 @@ impl ApiClient {
 
         let url = format!("{}/api/llm-benchmark-upload", self.base_url);
         let resp = self
-            .client
+            .client()?
             .post(&url)
             .header("Authorization", format!("Bearer {}", self.api_key))
             .header("Content-Type", "application/json")
@@ -334,3 +438,67 @@ impl ApiClient {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_active_available_model_routes() {
+        let body = json!({
+            "models": [
+                {
+                    "displayName": "GPT Test",
+                    "vendor": "openai",
+                    "apiModel": "gpt-test",
+                    "openrouterModel": "openai/gpt-test",
+                    "active": true,
+                    "available": true
+                },
+                {
+                    "displayName": "Inactive",
+                    "vendor": "openai",
+                    "apiModel": "inactive",
+                    "active": false,
+                    "available": true
+                },
+                {
+                    "displayName": "Unavailable",
+                    "vendor": "openai",
+                    "apiModel": "unavailable",
+                    "active": true,
+                    "available": false
+                }
+            ]
+        });
+
+        let routes = parse_model_routes_response(&body).unwrap();
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "GPT Test");
+        assert_eq!(routes[0].vendor, Vendor::OpenAi);
+        assert_eq!(routes[0].api_model, "gpt-test");
+        assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test"));
+    }
+
+    #[test]
+    fn parses_snake_case_model_route_fields() {
+        let body = json!({
+            "models": [
+                {
+                    "display_name": "GPT Test",
+                    "vendor": "openai",
+                    "api_model": "gpt-test",
+                    "openrouter_model": "openai/gpt-test",
+                    "active": true,
+                    "available": true
+                }
+            ]
+        });
+
+        let routes = parse_model_routes_response(&body).unwrap();
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "GPT Test");
+        assert_eq!(routes[0].api_model, "gpt-test");
+        assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test"));
+    }
+}
diff --git a/tools/xtask-llm-benchmark/src/bench/analysis.rs b/tools/xtask-llm-benchmark/src/bench/analysis.rs
index 0234cba1b8f..cb23fbb6cf5 100644
--- a/tools/xtask-llm-benchmark/src/bench/analysis.rs
+++ b/tools/xtask-llm-benchmark/src/bench/analysis.rs
@@ -27,10 +27,10 @@ pub async fn run_analysis(
     let prompt = build_prompt(lang, mode, model_name, bench_root, &failures);
 
     let route = ModelRoute::new(
-        "gpt-4.1-mini",
+        "gpt-5.4-mini",
         crate::llm::types::Vendor::OpenAi,
-        "gpt-4.1-mini",
-        Some("openai/gpt-4.1-mini"),
+        "gpt-5.4-mini",
+        Some("openai/gpt-5.4-mini"),
     );
 
     let built = BuiltPrompt {
diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs
index 68775ff631c..b7fb74c6936 100644
--- a/tools/xtask-llm-benchmark/src/bench/publishers.rs
+++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs
@@ -1,12 +1,16 @@
 use crate::bench::utils::sanitize_db_name;
-use anyhow::{bail, Result};
+use anyhow::{bail, Context, Result};
 use regex::Regex;
 use std::borrow::Cow;
 use std::env;
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::Command;
-use std::sync::LazyLock;
+use std::sync::{
+    atomic::{AtomicU64, Ordering},
+    LazyLock,
+};
+use std::time::{SystemTime, UNIX_EPOCH};
 
 fn workspace_root() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR"))
@@ -31,6 +35,163 @@ fn pnpm_minimum_release_age() -> Result<String> {
         .ok_or_else(|| anyhow::anyhow!("pnpm-workspace.yaml is missing minimumReleaseAge"))
 }
 
+fn path_entries() -> Vec<PathBuf> {
+    #[cfg(windows)]
+    let path = env::var_os("Path").or_else(|| env::var_os("PATH"));
+    #[cfg(not(windows))]
+    let path = env::var_os("PATH");
+
+    path.map(|path| env::split_paths(&path).collect()).unwrap_or_default()
+}
+
+fn command_path_candidates(name: &str) -> Vec<String> {
+    #[cfg(windows)]
+    {
+        let path = Path::new(name);
+        if path.extension().is_some() {
+            vec![name.to_string()]
+        } else {
+            vec![
+                format!("{name}.cmd"),
+                format!("{name}.exe"),
+                format!("{name}.bat"),
+                name.to_string(),
+            ]
+        }
+    }
+    #[cfg(not(windows))]
+    {
+        vec![name.to_string()]
+    }
+}
+
+/// Looks `name` up on the executable search path.
+/// Directories are scanned in search-path order and, within each directory, the candidate
+/// file names in the order `command_path_candidates` yields them; the first regular file wins.
+fn resolve_command_on_path(name: &str) -> Option<PathBuf> {
+    path_entries().into_iter().find_map(|dir| {
+        command_path_candidates(name)
+            .into_iter()
+            .map(|file_name| dir.join(file_name))
+            .find(|full_path| full_path.is_file())
+    })
+}
+
+/// `NODEJS_DIR` as a path, after stripping whitespace and surrounding double quotes;
+/// `None` when the variable is unset, not valid Unicode, or empty once cleaned.
+fn configured_nodejs_dir() -> Option<PathBuf> {
+    let raw = env::var("NODEJS_DIR").ok()?;
+    let cleaned = raw.trim().trim_matches('"').trim();
+    (!cleaned.is_empty()).then(|| PathBuf::from(cleaned))
+}
+
+fn pnpm_in_dir(dir: &Path) -> Option<PathBuf> {
+    #[cfg(windows)]
+    {
+        for candidate in ["pnpm.cmd", "pnpm.exe", "pnpm.bat"] {
+            let path = dir.join(candidate);
+            if path.is_file() {
+                return Some(path);
+            }
+        }
+        None
+    }
+    #[cfg(not(windows))]
+    {
+        let path = dir.join("pnpm");
+        path.is_file().then_some(path)
+    }
+}
+
+fn node_in_dir(dir: &Path) -> Option<PathBuf> {
+    #[cfg(windows)]
+    let path = dir.join("node.exe");
+    #[cfg(not(windows))]
+    let path = dir.join("node");
+
+    path.is_file().then_some(path)
+}
+
+fn resolve_node_exe(nodejs_dir: Option<&Path>) -> Option<PathBuf> {
+    nodejs_dir
+        .and_then(node_in_dir)
+        .or_else(|| resolve_command_on_path("node"))
+        .or_else(|| {
+            env::var("NVM_SYMLINK")
+                .ok()
+                .map(PathBuf::from)
+                .and_then(|dir| node_in_dir(&dir))
+        })
+}
+
+struct CliRootDir {
+    path: PathBuf,
+}
+
+impl CliRootDir {
+    fn path(&self) -> &Path {
+        &self.path
+    }
+}
+
+impl Drop for CliRootDir {
+    fn drop(&mut self) {
+        let _ = fs::remove_dir_all(&self.path);
+    }
+}
+
+fn isolated_cli_root() -> Result<CliRootDir> {
+    static COUNTER: AtomicU64 = AtomicU64::new(0);
+
+    for _ in 0..16 {
+        let nanos = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map(|duration| duration.as_nanos())
+            .unwrap_or(0);
+        let id = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let path = env::temp_dir().join(format!("stdb-llm-cli-{}-{nanos}-{id}", std::process::id()));
+        match fs::create_dir(&path) {
+            Ok(()) => return Ok(CliRootDir { path }),
+            Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => continue,
+            Err(error) => return Err(error.into()),
+        }
+    }
+
+    bail!("failed to create isolated SpacetimeDB CLI root directory");
+}
+
+fn spacetime_cmd(cli_root: &CliRootDir) -> Command {
+    let mut cmd = Command::new("spacetime");
+    cmd.arg("--root-dir").arg(cli_root.path());
+    cmd
+}
+
+fn pnpm_cjs_for_cmd(pnpm: &Path) -> Option<PathBuf> {
+    #[cfg(windows)]
+    {
+        let is_cmd = pnpm
+            .extension()
+            .and_then(|ext| ext.to_str())
+            .is_some_and(|ext| ext.eq_ignore_ascii_case("cmd"));
+        if !is_cmd {
+            return None;
+        }
+
+        let cjs = pnpm
+            .parent()?
+            .join("node_modules")
+            .join("pnpm")
+            .join("bin")
+            .join("pnpm.cjs");
+        cjs.is_file().then_some(cjs)
+    }
+    #[cfg(not(windows))]
+    {
+        let _ = pnpm;
+        None
+    }
+}
+
 /// Strip ANSI escape codes (color codes) from a string
 fn strip_ansi_codes(s: &str) -> Cow<'_, str> {
     static ANSI_RE: LazyLock<Regex> = LazyLock::new(|| {
@@ -50,14 +211,14 @@ pub trait Publisher: Send + Sync {
 
 /// Check if the process was killed by a signal (e.g., SIGSEGV = 11)
 #[cfg(unix)]
-fn was_signal_killed(status: &std::process::ExitStatus) -> bool {
+fn signal_killed_by(status: &std::process::ExitStatus) -> Option<i32> {
     use std::os::unix::process::ExitStatusExt;
-    status.signal().is_some()
+    status.signal()
 }
 
 #[cfg(not(unix))]
-fn was_signal_killed(_status: &std::process::ExitStatus) -> bool {
-    false
+fn signal_killed_by(_status: &std::process::ExitStatus) -> Option<i32> {
+    None
 }
 
 /// Check if the failure is a transient error that should be retried.
@@ -73,6 +234,8 @@ fn is_transient_build_error(stderr: &str, stdout: &str) -> bool {
         // trying to extract the same tarball simultaneously
         || (combined.contains("wasi-sdk") && combined.contains("tar"))
         || (combined.contains("MSB3073") && combined.contains("exited with code 2"))
+        // spacetime exits 1 when dotnet crashes underneath it, so detect that crash from the output text.
+        || combined.contains("code <signal")
 }
 
 fn run(cmd: &mut Command, label: &str) -> Result<()> {
@@ -119,13 +282,14 @@ fn run_with_retry(cmd: &mut Command, label: &str, max_retries: u32) -> Result<()
         let stderr = strip_ansi_codes(&stderr_raw);
         let stdout = strip_ansi_codes(&stdout_raw);
 
-        // Retry on signal kills (like SIGSEGV) or transient build errors
-        let should_retry = was_signal_killed(&out.status) || is_transient_build_error(&stderr, &stdout);
+        // Retry on signal kills (like SIGSEGV) or transient build errors.
+        let signal = signal_killed_by(&out.status);
+        let should_retry = signal.is_some() || is_transient_build_error(&stderr, &stdout);
         if should_retry && attempt < max_retries {
-            let reason = if was_signal_killed(&out.status) {
-                "signal kill"
+            let reason = if let Some(signal) = signal {
+                format!("signal {signal}")
             } else {
-                "transient build error"
+                "transient build error".to_string()
             };
             eprintln!("⚠️ {label}: {reason} detected, will retry...");
             last_error = Some(format!(
@@ -162,6 +326,19 @@ impl DotnetPublisher {
         }
         Ok(())
     }
+
+    fn configure_dotnet_env(cmd: &mut Command) -> &mut Command {
+        cmd.env("DOTNET_CLI_TELEMETRY_OPTOUT", "1")
+            .env("DOTNET_NOLOGO", "1")
+            // The CI runner's .NET install can crash while formatting localized
+            // DateTime/TimeZoneInfo data before publish starts. Force invariant
+            // globalization so generated C# module publish reaches MSBuild.
+            .env("DOTNET_SYSTEM_GLOBALIZATION_INVARIANT", "1")
+            // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
+            // when running multiple dotnet builds in parallel.
+            .env("MSBUILDDISABLENODEREUSE", "1")
+            .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0")
+    }
 }
 
 impl Publisher for DotnetPublisher {
@@ -174,27 +351,23 @@ impl Publisher for DotnetPublisher {
         Self::ensure_csproj(source)?;
 
         let db = sanitize_db_name(module_name);
+        let source = source
+            .canonicalize()
+            .with_context(|| format!("failed to resolve C# source path {}", source.display()))?;
+        let cli_root = isolated_cli_root()?;
 
-        let mut cmd = Command::new("spacetime");
-        cmd.arg("build")
-            .current_dir(source)
-            .env("DOTNET_CLI_TELEMETRY_OPTOUT", "1")
-            .env("DOTNET_NOLOGO", "1")
-            // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
-            // when running multiple dotnet builds in parallel.
-            .env("MSBUILDDISABLENODEREUSE", "1")
-            .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0");
-        run(&mut cmd, "spacetime build (csharp)")?;
-
-        let mut pubcmd = Command::new("spacetime");
+        let mut pubcmd = spacetime_cmd(&cli_root);
         pubcmd
             .arg("publish")
             .arg("-c")
             .arg("-y")
             .arg("--server")
             .arg(host_url)
+            .arg("--module-path")
+            .arg(&source)
             .arg(&db)
-            .current_dir(source);
+            .current_dir(&source);
+        Self::configure_dotnet_env(&mut pubcmd);
         run(&mut pubcmd, "spacetime publish (csharp)")?;
 
         Ok(())
@@ -228,10 +401,11 @@ impl Publisher for SpacetimeRustPublisher {
 
         // sanitize db + server
         let db = sanitize_db_name(module_name);
+        let cli_root = isolated_cli_root()?;
 
         // 2) Publish
         run(
-            Command::new("spacetime")
+            spacetime_cmd(&cli_root)
                 .arg("publish")
                 .arg("-c")
                 .arg("-y")
@@ -271,51 +445,34 @@ impl Publisher for TypeScriptPublisher {
 
         Self::ensure_package_json(source)?;
         let db = sanitize_db_name(module_name);
+        let cli_root = isolated_cli_root()?;
 
         // Install dependencies (--ignore-workspace to avoid parent workspace interference).
-        // If NODEJS_DIR is set (e.g. nvm4w on Windows), use full path to pnpm so spawn finds it.
-        let pnpm_exe = env::var("NODEJS_DIR")
-            .ok()
-            .map(|s| s.trim().trim_matches('"').trim().to_string())
-            .filter(|s| !s.is_empty())
-            .map(PathBuf::from)
-            .and_then(|dir| {
-                #[cfg(windows)]
-                {
-                    let pnpm_cmd = dir.join("pnpm.cmd");
-                    let pnpm_exe_path = dir.join("pnpm.exe");
-                    if pnpm_cmd.is_file() {
-                        eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.cmd)", dir.display());
-                        Some(pnpm_cmd)
-                    } else if pnpm_exe_path.is_file() {
-                        eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.exe)", dir.display());
-                        Some(pnpm_exe_path)
-                    } else {
-                        eprintln!(
-                            "[pnpm] NODEJS_DIR set to {} but pnpm.cmd/pnpm.exe not found there, using PATH",
-                            dir.display()
-                        );
-                        None
-                    }
-                }
-                #[cfg(not(windows))]
-                {
-                    let pnpm = dir.join("pnpm");
-                    if pnpm.is_file() {
-                        eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm)", dir.display());
-                        Some(pnpm)
-                    } else {
-                        eprintln!(
-                            "[pnpm] NODEJS_DIR set to {} but pnpm not found there, using PATH",
-                            dir.display()
-                        );
-                        None
-                    }
-                }
-            });
-        let mut pnpm_cmd = match &pnpm_exe {
-            Some(p) => Command::new(p),
-            None => Command::new("pnpm"),
+        let nodejs_dir = configured_nodejs_dir();
+        let pnpm_exe = nodejs_dir
+            .as_deref()
+            .and_then(pnpm_in_dir)
+            .or_else(|| resolve_command_on_path("pnpm"));
+        if let Some(ref pnpm) = pnpm_exe {
+            eprintln!("[pnpm] using {}", pnpm.display());
+        } else if let Some(ref dir) = nodejs_dir {
+            eprintln!(
+                "[pnpm] NODEJS_DIR set to {} but pnpm not found there or on PATH",
+                dir.display()
+            );
+        }
+        let node_exe = resolve_node_exe(nodejs_dir.as_deref());
+        let pnpm_cjs = pnpm_exe.as_deref().and_then(pnpm_cjs_for_cmd);
+        let mut pnpm_cmd = if let (Some(node), Some(cjs)) = (&node_exe, pnpm_cjs) {
+            eprintln!("[pnpm] invoking {} {}", node.display(), cjs.display());
+            let mut cmd = Command::new(node);
+            cmd.arg(cjs);
+            cmd
+        } else {
+            match &pnpm_exe {
+                Some(p) => Command::new(p),
+                None => Command::new("pnpm"),
+            }
         };
         pnpm_cmd
             .arg("install")
@@ -325,30 +482,62 @@ impl Publisher for TypeScriptPublisher {
             // This install runs in a materialized project with workspace config
             // ignored, so pass the repo's pnpm package-age policy explicitly.
             .env("npm_config_minimum_release_age", pnpm_minimum_release_age()?);
-        // When using NODEJS_DIR, prepend it to PATH so pnpm.cmd can find node.
-        if let Some(ref dir) = pnpm_exe
-            && let Some(parent) = dir.parent()
+        let mut prepend_paths = Vec::new();
+        if let Some(dir) = nodejs_dir {
+            prepend_paths.push(dir);
+        }
+        if let Some(ref pnpm) = pnpm_exe
+            && let Some(parent) = pnpm.parent()
+        {
+            prepend_paths.push(parent.to_path_buf());
+        }
+        if let Some(node) = node_exe
+            && let Some(parent) = node.parent()
         {
-            let mut paths: Vec<PathBuf> = env::split_paths(&env::var("PATH").unwrap_or_default()).collect();
-            paths.insert(0, parent.to_path_buf());
-            if let Ok(new_path) = env::join_paths(paths) {
-                pnpm_cmd.env("PATH", new_path);
+            prepend_paths.push(parent.to_path_buf());
+        }
+        let child_path = if !prepend_paths.is_empty() {
+            let mut paths = path_entries();
+            for path in prepend_paths.into_iter().rev() {
+                if !paths.iter().any(|existing| existing == &path) {
+                    paths.insert(0, path);
+                }
             }
+            env::join_paths(paths).ok()
+        } else {
+            None
+        };
+        if let Some(ref new_path) = child_path {
+            #[cfg(windows)]
+            {
+                pnpm_cmd.env_remove("PATH");
+                pnpm_cmd.env("Path", new_path);
+            }
+            #[cfg(not(windows))]
+            pnpm_cmd.env("PATH", new_path);
         }
         run(&mut pnpm_cmd, "pnpm install (typescript)")?;
 
         // Publish (spacetime CLI handles TypeScript compilation internally)
-        run(
-            Command::new("spacetime")
-                .arg("publish")
-                .arg("-c")
-                .arg("-y")
-                .arg("--server")
-                .arg(host_url)
-                .arg(&db)
-                .current_dir(source),
-            "spacetime publish (typescript)",
-        )?;
+        let mut publish_cmd = spacetime_cmd(&cli_root);
+        publish_cmd
+            .arg("publish")
+            .arg("-c")
+            .arg("-y")
+            .arg("--server")
+            .arg(host_url)
+            .arg(&db)
+            .current_dir(source);
+        if let Some(ref new_path) = child_path {
+            #[cfg(windows)]
+            {
+                publish_cmd.env_remove("PATH");
+                publish_cmd.env("Path", new_path);
+            }
+            #[cfg(not(windows))]
+            publish_cmd.env("PATH", new_path);
+        }
+        run(&mut publish_cmd, "spacetime publish (typescript)")?;
 
         Ok(())
     }
diff --git a/tools/xtask-llm-benchmark/src/bench/runner.rs b/tools/xtask-llm-benchmark/src/bench/runner.rs
index 42acd77a70c..2536b5e5fe1 100644
--- a/tools/xtask-llm-benchmark/src/bench/runner.rs
+++ b/tools/xtask-llm-benchmark/src/bench/runner.rs
@@ -473,6 +473,23 @@ async fn maybe_generate_analysis(cfg: &BenchRunContext<'_>, outcomes: &[RunOutco
     Ok(analysis)
 }
 
+/// Uploads one batch of outcomes through `cfg.api_client`, or logs and skips when none is set.
+/// The upload is run via `tokio::task::spawn_blocking`, so its inputs are cloned into owned values.
+async fn upload_batch_for_context(
+    cfg: &BenchRunContext<'_>,
+    outcomes: &[RunOutcome],
+    analysis: Option<&str>,
+) -> Result<()> {
+    let Some(client) = cfg.api_client.clone() else {
+        eprintln!("[runner] no API client configured; skipping upload");
+        return Ok(());
+    };
+    let (mode, batch) = (cfg.mode.to_string(), outcomes.to_vec());
+    let notes = analysis.map(str::to_string);
+    tokio::task::spawn_blocking(move || client.upload_batch(&mode, &batch, notes.as_deref())).await??;
+    Ok(())
+}
+
 pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Result<Vec<RunOutcome>> {
     let total_wall = Instant::now();
 
@@ -632,11 +649,7 @@ pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Resu
                 None
             }
         };
-        if let Some(ref api) = cfg.api_client {
-            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?;
-        } else {
-            eprintln!("[runner] no API client configured; skipping upload");
-        }
+        upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?;
     } else {
         eprintln!("[runner] no results; skipping upload");
     }
@@ -831,11 +844,7 @@ pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) ->
                 None
             }
         };
-        if let Some(ref api) = cfg.api_client {
-            api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?;
-        } else {
-            eprintln!("[runner] no API client configured; skipping upload");
-        }
+        upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?;
     }
 
     println!(
diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs
index b5fa5f6add3..35176de8200 100644
--- a/tools/xtask-llm-benchmark/src/bench/templates.rs
+++ b/tools/xtask-llm-benchmark/src/bench/templates.rs
@@ -159,20 +159,104 @@ fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> {
     }
     fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))?;
 
-    let base_rel = relative_to_workspace(root, "crates/bindings-csharp")?;
     let runtime_csproj = workspace_root().join("crates/bindings-csharp/Runtime/Runtime.csproj");
     if !runtime_csproj.is_file() {
         bail!("local C# Runtime not found at {}", runtime_csproj.display());
     }
-    let runtime_ref = format!("{}/Runtime/Runtime.csproj", base_rel);
-    let runtime_dir = format!("{}/Runtime", base_rel);
-    let codegen_ref = format!("{}/Codegen/Codegen.csproj", base_rel);
+    let runtime_version = read_csharp_package_version(&runtime_csproj)?;
     let csproj_path = root.join("StdbModule.csproj");
     let mut csproj = fs::read_to_string(&csproj_path).with_context(|| format!("read {}", csproj_path.display()))?;
-    csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_DIR}", &runtime_dir);
-    csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_REF}", &runtime_ref);
-    csproj = csproj.replace("{SPACETIME_CSHARP_CODEGEN_REF}", &codegen_ref);
+    csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_VERSION}", &runtime_version);
     fs::write(&csproj_path, csproj).with_context(|| format!("write {}", csproj_path.display()))?;
+
+    write_csharp_nuget_config(root)?;
+    Ok(())
+}
+
+/// Extracts the text of the first `<Version>…</Version>` element in a `.csproj` by substring search.
+/// Errors when the file is unreadable or has no closed, non-empty `<Version>` element.
+fn read_csharp_package_version(csproj_path: &Path) -> Result<String> {
+    let contents = fs::read_to_string(csproj_path).with_context(|| format!("read {}", csproj_path.display()))?;
+    let version = contents
+        .split_once("<Version>")
+        .and_then(|(_, rest)| rest.split_once("</Version>"))
+        .map(|(value, _)| value.trim().to_owned())
+        .filter(|value| !value.is_empty());
+    version.with_context(|| format!("missing <Version> in {}", csproj_path.display()))
+}
+
+/// Renders `path` as text with every `\` replaced by `/` and all trailing `/` removed,
+/// the form written into the generated nuget.config.
+fn normalize_nuget_path(path: &Path) -> String {
+    let mut text = path.display().to_string().replace('\\', "/");
+    text.truncate(text.trim_end_matches('/').len());
+    text
+}
+
+/// Checks that directory `path` contains a file named `<package_id>….nupkg`; a directory that
+/// cannot be read counts as "not found". The error names the `dotnet pack` command to run.
+fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> {
+    let is_package = |name: &str| name.starts_with(package_id) && name.ends_with(".nupkg");
+    let found = fs::read_dir(path)
+        .is_ok_and(|entries| entries.flatten().any(|e| e.file_name().to_str().is_some_and(is_package)));
+    if !found {
+        let project = package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id);
+        bail!(
+            "local C# package {} not found in {}. Run: dotnet pack -c Release crates/bindings-csharp/{}",
+            package_id,
+            path.display(),
+            project
+        );
+    }
+    Ok(())
+}
+
+/// Writes `<root>/nuget.config`: SpacetimeDB runtime packages map to the workspace's local
+/// `bin/Release` folders, all else to nuget.org. Recreates `<root>/.nuget/packages` as the cache.
+fn write_csharp_nuget_config(root: &Path) -> Result<()> {
+    let workspace = workspace_root();
+    let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release");
+    let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release");
+    ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?;
+    ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?;
+    let package_cache = root.join(".nuget/packages");
+    if package_cache.exists() {
+        fs::remove_dir_all(&package_cache).with_context(|| format!("remove {}", package_cache.display()))?;
+    }
+    fs::create_dir_all(&package_cache).with_context(|| format!("create {}", package_cache.display()))?;
+    // Paths land inside XML attribute values, so escape the characters XML reserves there.
+    let esc = |p: &Path| normalize_nuget_path(p).replace('&', "&amp;").replace('<', "&lt;").replace('"', "&quot;");
+    let nuget_config = format!(
+        r#"<?xml version="1.0" encoding="utf-8"?>
+<configuration>
+  <config>
+    <add key="globalPackagesFolder" value="{}" />
+  </config>
+  <packageSources>
+    <clear />
+    <add key="spacetimedb-runtime" value="{}" />
+    <add key="spacetimedb-bsatn-runtime" value="{}" />
+    <add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
+  </packageSources>
+  <packageSourceMapping>
+    <packageSource key="spacetimedb-runtime">
+      <package pattern="SpacetimeDB.Runtime" />
+    </packageSource>
+    <packageSource key="spacetimedb-bsatn-runtime">
+      <package pattern="SpacetimeDB.BSATN.Runtime" />
+    </packageSource>
+    <packageSource key="nuget.org">
+      <package pattern="*" />
+    </packageSource>
+  </packageSourceMapping>
+</configuration>
+"#,
+        esc(&package_cache),
+        esc(&runtime_source),
+        esc(&bsatn_source),
+    );
+    fs::write(root.join("nuget.config"), nuget_config)
+        .with_context(|| format!("write {}", root.join("nuget.config").display()))?;
     Ok(())
 }
 
diff --git a/tools/xtask-llm-benchmark/src/bench/types.rs b/tools/xtask-llm-benchmark/src/bench/types.rs
index 930e3feac1c..e54df0d4902 100644
--- a/tools/xtask-llm-benchmark/src/bench/types.rs
+++ b/tools/xtask-llm-benchmark/src/bench/types.rs
@@ -209,4 +209,6 @@ pub struct RunConfig {
     pub local_analysis: bool,
     /// Shared identifier used to group dry-run artifacts
     pub dry_run_id: Option<String>,
+    /// Website-provided route list used instead of static default_model_routes()
+    pub route_overrides: Option<Vec<ModelRoute>>,
 }
diff --git a/tools/xtask-llm-benchmark/src/bench/utils.rs b/tools/xtask-llm-benchmark/src/bench/utils.rs
index a8ccddc23e5..6e28315e4f6 100644
--- a/tools/xtask-llm-benchmark/src/bench/utils.rs
+++ b/tools/xtask-llm-benchmark/src/bench/utils.rs
@@ -109,13 +109,13 @@ pub fn bench_rust_concurrency() -> usize {
         .unwrap_or(2)
 }
 
-/// Concurrency for C# builds. Lower default than Rust due to dotnet/WASI SDK
-/// instability under high parallelism (causes SIGSEGV and "Pipe is broken" errors).
+/// Concurrency for C# builds. Keep this serialized to match smoketest behavior;
+/// dotnet/WASI SDK builds are fragile when multiple generated modules publish at once.
 pub fn bench_csharp_concurrency() -> usize {
     env::var("LLM_BENCH_CSHARP_CONCURRENCY")
         .ok()
         .and_then(|s| s.parse().ok())
-        .unwrap_or(4)
+        .unwrap_or(1)
 }
 
 pub fn bench_route_concurrency() -> usize {
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts
index 26c7dc9b230..1ba8ca175d1 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts
@@ -2,7 +2,7 @@ import { schema, table, t } from 'spacetimedb/server';
 
 const eventLog = table({
   name: 'event_log',
-  indexes: [{ name: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }],
+  indexes: [{ accessor: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }],
 }, {
   id: t.u64().primaryKey().autoInc(),
   category: t.string(),
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts
index 50d9f9c1dae..d23dead5a96 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts
@@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server';
 
 const account = table({
   name: 'account',
-  indexes: [{ name: 'byName', algorithm: 'btree', columns: ['name'] }],
+  indexes: [{ accessor: 'byName', algorithm: 'btree', columns: ['name'] }],
 }, {
   id: t.u64().primaryKey().autoInc(),
   email: t.string().unique(),
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts
index d7629137dcc..4ab152504d1 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts
@@ -24,8 +24,8 @@ const membership = table(
   {
     name: 'membership',
     indexes: [
-      { name: 'byUser', algorithm: 'btree', columns: ['userId'] },
-      { name: 'byGroup', algorithm: 'btree', columns: ['groupId'] },
+      { accessor: 'byUser', algorithm: 'btree', columns: ['userId'] },
+      { accessor: 'byGroup', algorithm: 'btree', columns: ['groupId'] },
     ],
   },
   {
diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts
index 5d5fb568d7b..2f237fb0151 100644
--- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts
+++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts
@@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server';
 
 const log = table({
   name: 'log',
-  indexes: [{ name: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }],
+  indexes: [{ accessor: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }],
 }, {
   id: t.u64().primaryKey().autoInc(),
   userId: t.i32(),
diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
index c624fdc4108..6ec030a49e8 100644
--- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
+++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs
@@ -1,7 +1,7 @@
 #![allow(clippy::disallowed_macros, clippy::type_complexity, clippy::enum_variant_names)]
 
 use anyhow::{Context, Result};
-use clap::{Args, Parser, Subcommand};
+use clap::{Args, Parser, Subcommand, ValueEnum};
 use futures::{StreamExt, TryStreamExt};
 use spacetimedb_data_structures::map::{HashCollectionExt as _, HashMap, HashSet};
 use spacetimedb_guard::SpacetimeDbGuard;
@@ -71,6 +71,12 @@ struct Cli {
     command: Commands,
 }
 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
+enum ModelSource {
+    Static, // use the built-in default_model_routes() list
+    Remote, // fetch the route list through the API client (fetch_model_routes)
+}
+
 #[derive(Subcommand, Debug)]
 enum Commands {
     /// Run benchmarks / build goldens / compute hashes.
@@ -124,6 +130,10 @@ struct RunArgs {
     #[arg(long, num_args = 1..)]
     models: Option<Vec<ModelGroup>>,
 
+    /// Where to resolve models when --models is not provided
+    #[arg(long, value_enum, default_value_t = ModelSource::Static)]
+    model_source: ModelSource,
+
     /// Run benchmarks without uploading results
     #[arg(long)]
     dry_run: bool,
@@ -131,6 +141,9 @@ struct RunArgs {
     /// When used with --dry-run, also generate local markdown analysis files
     #[arg(long, requires = "dry_run")]
     local_analysis: bool,
+
+    #[arg(skip)]
+    route_overrides: Option<Vec<ModelRoute>>,
 }
 
 #[derive(Args, Debug, Clone)]
@@ -218,14 +231,17 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
     let dry_run = args.dry_run;
     let local_analysis = args.local_analysis;
     let dry_run_id = dry_run.then(|| chrono::Utc::now().format("%Y-%m-%d_%H%M%S").to_string());
+    let should_fetch_remote_routes = should_fetch_remote_routes(&args);
 
-    let api_client = if dry_run {
-        None
-    } else {
+    let needs_api_client = should_fetch_remote_routes || !dry_run;
+    let api_client = if needs_api_client {
         ApiClient::from_env().context("failed to initialize API client")?
+    } else {
+        None
     };
+    let upload_client = if dry_run { None } else { api_client.clone() };
 
-    if api_client.is_none() && !dry_run {
+    if upload_client.is_none() && !dry_run {
         eprintln!("[warn] LLM_BENCHMARK_UPLOAD_URL not set; results will not be uploaded");
     }
 
@@ -240,31 +256,30 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
         categories: categories_to_set(args.categories),
         model_filter: model_filter_from_groups(args.models),
         host: None,
-        api_client: api_client.clone(),
+        api_client: upload_client.clone(),
         dry_run,
         local_analysis,
         dry_run_id: dry_run_id.clone(),
+        route_overrides: args.route_overrides,
     };
 
+    if should_fetch_remote_routes {
+        let api = api_client
+            .as_ref()
+            .context("LLM_BENCHMARK_UPLOAD_URL required when --model-source remote is used")?;
+        config.route_overrides = Some(api.fetch_model_routes()?);
+    }
+
     let bench_root = find_bench_root();
 
     // Upload task catalog before running benchmarks
-    if let Some(ref api) = api_client
+    if let Some(ref api) = upload_client
         && let Err(e) = api.upload_task_catalog(&bench_root)
     {
         eprintln!("[warn] failed to upload task catalog: {e}");
     }
 
-    let modes = config
-        .modes
-        .clone()
-        .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect());
-
-    let RuntimeInit {
-        runtime,
-        provider: llm_provider,
-        guard,
-    } = initialize_runtime_and_provider(config.hash_only, config.goldens_only)?;
+    let RuntimeInit { runtime, guard } = initialize_runtime(config.hash_only)?;
 
     config.host = guard.as_ref().map(|g| g.host_url.clone());
 
@@ -273,7 +288,24 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
     let selectors: Option<Vec<String>> = config.selectors.clone();
     let selectors_ref: Option<&[String]> = selectors.as_deref();
 
-    if !config.goldens_only && !config.hash_only {
+    let modes = config
+        .modes
+        .clone()
+        .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect());
+
+    if config.goldens_only {
+        let rt = runtime.as_ref().expect("runtime required for --goldens-only");
+        rt.block_on(build_goldens_only_for_lang(
+            config.host.clone(),
+            &bench_root,
+            config.lang,
+            selectors_ref,
+        ))?;
+        println!("[{}] goldens-only build complete", config.lang.as_str());
+        return Ok(());
+    }
+
+    let llm_provider = if !config.goldens_only && !config.hash_only {
         let rt = runtime.as_ref().expect("failed to initialize runtime for goldens");
         rt.block_on(ensure_goldens_built_once(
             config.host.clone(),
@@ -281,7 +313,15 @@ fn run_benchmarks(args: RunArgs) -> Result<()> {
             config.lang,
             selectors_ref,
         ))?;
-    }
+
+        let provider = make_provider_from_env()?;
+        let rt = runtime.as_ref().expect("failed to initialize runtime for preflight");
+        let routes = filter_routes(&config);
+        preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?;
+        Some(provider)
+    } else {
+        None
+    };
 
     let mut all_outcomes: Vec<RunOutcome> = Vec::new();
 
@@ -379,10 +419,10 @@ fn cmd_analyze(args: AnalyzeArgs) -> Result<()> {
     let provider = make_provider_from_env()?;
 
     let analysis_route = ModelRoute::new(
-        "gpt-4.1-mini",
+        "gpt-5.4-mini",
         xtask_llm_benchmark::llm::types::Vendor::OpenAi,
-        "gpt-4.1-mini",
-        Some("openai/gpt-4.1-mini"),
+        "gpt-5.4-mini",
+        Some("openai/gpt-5.4-mini"),
     );
 
     for ((lang, mode, model), group_failures) in &groups {
@@ -517,6 +557,59 @@ fn short_hash(s: &str) -> &str {
     &s[..s.len().min(12)]
 }
 
+/// True when routes must be fetched remotely: `--model-source remote`, no explicit `--models`,
+/// no pre-set `route_overrides`, and neither `hash_only` nor `goldens_only` is requested.
+fn should_fetch_remote_routes(args: &RunArgs) -> bool {
+    let wants_remote = matches!(args.model_source, ModelSource::Remote);
+    let unresolved = args.models.is_none() && args.route_overrides.is_none();
+    wants_remote && unresolved && !(args.hash_only || args.goldens_only)
+}
+
+/// Probes every route through `llm_provider.preflight_route` before any benchmark runs.
+///
+/// Each route is checked with search enabled if `modes` contains "search", and with search
+/// disabled if `modes` contains any other mode. Every failure is logged to stderr and all of
+/// them are reported together in the returned error. An empty `routes` slice is a no-op.
+fn preflight_llm_routes(
+    runtime: &Runtime,
+    llm_provider: &dyn LlmProvider,
+    routes: &[ModelRoute],
+    modes: &[String],
+) -> Result<()> {
+    if routes.is_empty() {
+        return Ok(());
+    }
+
+    // (search flag, label) pairs to probe per route; the search variant goes first.
+    let variants: Vec<(bool, &str)> = [
+        (modes.iter().any(|mode| mode == "search"), true, "search/OpenRouter online"),
+        (modes.iter().any(|mode| mode != "search"), false, "standard"),
+    ]
+    .into_iter()
+    .filter_map(|(wanted, flag, label)| wanted.then_some((flag, label)))
+    .collect();
+
+    let mut failures = Vec::new();
+    for route in routes {
+        for &(search_enabled, mode_label) in &variants {
+            let Err(err) = runtime.block_on(llm_provider.preflight_route(route, search_enabled)) else {
+                continue;
+            };
+            let msg = format!("{} ({mode_label}): {err:#}", route.display_name);
+            eprintln!("[preflight] FAILED {msg}");
+            failures.push(msg);
+        }
+    }
+
+    if failures.is_empty() {
+        return Ok(());
+    }
+    anyhow::bail!(
+        "LLM provider preflight failed before benchmark run:\n  - {}",
+        failures.join("\n  - ")
+    )
+}
+
 /// Run benchmarks for a single mode.
 fn run_mode_benchmarks(
     mode: &str,
@@ -538,15 +631,6 @@ fn run_mode_benchmarks(
         return Ok(Vec::new());
     }
 
-    if config.goldens_only {
-        let rt = runtime.expect("runtime required for --goldens-only");
-        let sels = config.selectors.as_deref();
-
-        rt.block_on(build_goldens_only_for_lang(config.host.clone(), bench_root, lang, sels))?;
-        println!("{:<12} [{:<10}] goldens-only build complete", mode, lang_str);
-        return Ok(Vec::new());
-    }
-
     // Run benchmarks for all matching routes
     let routes = filter_routes(config);
 
@@ -598,7 +682,12 @@ fn run_mode_benchmarks(
 /// When explicit `openrouter:vendor/model` entries are passed they won't appear in
 /// `default_model_routes`, so we synthesize ad-hoc routes for them here.
 fn filter_routes(config: &RunConfig) -> Vec<ModelRoute> {
-    let mut routes: Vec<ModelRoute> = default_model_routes()
+    let base_routes: Vec<ModelRoute> = config
+        .route_overrides
+        .clone()
+        .unwrap_or_else(|| default_model_routes().to_vec());
+
+    let mut routes: Vec<ModelRoute> = base_routes
         .iter()
         .filter(|r| config.providers_filter.as_ref().is_none_or(|f| f.contains(&r.vendor)))
         .filter(|r| match &config.model_filter {
@@ -627,6 +716,7 @@ fn filter_routes(config: &RunConfig) -> Vec<ModelRoute> {
                 let already_matched = routes.iter().any(|r| {
                     r.vendor == *vendor
                         && (r.api_model == model_id.as_str()
+                            || r.display_name.to_ascii_lowercase() == model_id.as_str()
                             || r.openrouter_model.as_deref() == Some(model_id.as_str()))
                 });
                 if !already_matched {
@@ -708,15 +798,13 @@ fn categories_to_set(v: Option<Vec<String>>) -> Option<HashSet<String>> {
 
 pub struct RuntimeInit {
     pub runtime: Option<Runtime>,
-    pub provider: Option<Arc<dyn LlmProvider>>,
     pub guard: Option<SpacetimeDbGuard>,
 }
 
-fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Result<RuntimeInit> {
+fn initialize_runtime(hash_only: bool) -> Result<RuntimeInit> {
     if hash_only {
         return Ok(RuntimeInit {
             runtime: None,
-            provider: None,
             guard: None,
         });
     }
@@ -726,18 +814,8 @@ fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Resul
 
     let runtime = tokio::runtime::Builder::new_multi_thread().enable_all().build()?;
 
-    if goldens_only {
-        return Ok(RuntimeInit {
-            runtime: Some(runtime),
-            provider: None,
-            guard: Some(spacetime),
-        });
-    }
-
-    let llm_provider = make_provider_from_env()?;
     Ok(RuntimeInit {
         runtime: Some(runtime),
-        provider: Some(llm_provider),
         guard: Some(spacetime),
     })
 }
@@ -753,8 +831,8 @@ fn find_bench_root() -> PathBuf {
     start.join("src").join("benchmarks")
 }
 
-fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet<String>) -> Result<HashSet<u32>> {
-    let mut nums = HashSet::new();
+fn collect_task_names_in_categories(bench_root: &Path, cats: &HashSet<String>) -> Result<HashSet<String>> {
+    let mut tasks = HashSet::new();
     for c in cats {
         let dir = bench_root.join(c);
         if !dir.is_dir() {
@@ -765,24 +843,38 @@ fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet<String>)
             if !entry.file_type()?.is_dir() {
                 continue;
             }
-            let name = entry.file_name().to_string_lossy().into_owned();
-            if let Some(rest) = name.strip_prefix("t_")
-                && let Some((num_str, _)) = rest.split_once('_')
-                && num_str.len() == 3
-                && let Ok(n) = num_str.parse::<u32>()
-            {
-                nums.insert(n);
-            }
+            tasks.insert(entry.file_name().to_string_lossy().to_ascii_lowercase());
         }
     }
-    Ok(nums)
+    Ok(tasks)
 }
 
-fn normalize_numeric_selectors(raw: &[String]) -> Vec<u32> {
-    raw.iter()
-        .filter(|s| !s.is_empty() && s.chars().all(|c| c.is_ascii_digit()))
-        .filter_map(|s| s.parse::<u32>().ok())
-        .collect()
+fn task_selector_matches_any(selector: &str, allowed_tasks: &HashSet<String>) -> bool {
+    allowed_tasks.iter().any(|task| task.starts_with(selector)) // prefix match, not equality
+}
+
+/// Canonicalizes a task selector: bare digits and `t_<digits>` become zero-padded `t_NNN`
+/// (`7` -> `t_007`); `t_<digit>…` made of ASCII alphanumerics/underscores is lowercased and kept.
+/// Anything else, including a bare `t_` or a number too large for `u32`, is an "invalid" error.
+fn normalize_task_filter_selector(raw: &str) -> Result<String> {
+    let s = raw.trim().to_ascii_lowercase();
+    if s.is_empty() {
+        anyhow::bail!("empty task selector");
+    }
+    let (has_prefix, rest) = match s.strip_prefix("t_") {
+        Some(rest) => (true, rest),
+        None => (false, s.as_str()),
+    };
+    // Guard against an empty `rest`: `all()` is vacuously true and would reach `parse` on "".
+    if !rest.is_empty() && rest.chars().all(|c| c.is_ascii_digit()) {
+        let n: u32 = rest.parse().with_context(|| format!("invalid task selector: {raw}"))?;
+        return Ok(format!("t_{:03}", n));
+    }
+    let is_name = rest.chars().next().is_some_and(|c| c.is_ascii_digit())
+        && rest.chars().all(|c| c.is_ascii_alphanumeric() || c == '_');
+    if has_prefix && is_name {
+        return Ok(s);
+    }
+    anyhow::bail!("invalid task selector: {raw}")
+}
 
 fn apply_category_filter(
@@ -796,23 +888,148 @@ fn apply_category_filter(
             Ok(selectors.map(|s| s.to_vec()))
         }
         Some(cats) => {
-            let allowed = collect_task_numbers_in_categories(bench_root, cats)?;
-            let out_nums: Vec<u32> = match selectors {
+            let allowed = collect_task_names_in_categories(bench_root, cats)?;
+            let mut out: Vec<String> = match selectors {
                 Some(user) => {
-                    let nums = normalize_numeric_selectors(user);
-                    nums.into_iter().filter(|n| allowed.contains(n)).collect()
+                    let mut selected = Vec::new();
+                    for selector in user {
+                        let normalized = normalize_task_filter_selector(selector)?;
+                        if task_selector_matches_any(&normalized, &allowed) {
+                            selected.push(normalized);
+                        }
+                    }
+                    selected
                 }
                 None => {
-                    let mut v: Vec<u32> = allowed.into_iter().collect();
+                    let mut v: Vec<String> = allowed.into_iter().collect();
                     v.sort_unstable();
                     v
                 }
             };
-            if out_nums.is_empty() {
-                Ok(None)
-            } else {
-                Ok(Some(out_nums.into_iter().map(|n| n.to_string()).collect()))
+            out.sort();
+            out.dedup();
+            if out.is_empty() {
+                anyhow::bail!("no tasks matched category/task filters");
             }
+            Ok(Some(out))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn base_run_args() -> RunArgs {
+        RunArgs {
+            modes: None,
+            lang: Lang::Rust,
+            hash_only: false,
+            goldens_only: false,
+            force: false,
+            categories: None,
+            tasks: None,
+            providers: None,
+            models: None,
+            model_source: ModelSource::Static,
+            dry_run: false,
+            local_analysis: false,
+            route_overrides: None,
+        }
+    }
+
+    fn base_config(route_overrides: Option<Vec<ModelRoute>>) -> RunConfig {
+        RunConfig {
+            modes: None,
+            hash_only: false,
+            goldens_only: false,
+            lang: Lang::Rust,
+            providers_filter: None,
+            selectors: None,
+            force: false,
+            categories: None,
+            model_filter: None,
+            host: None,
+            api_client: None,
+            dry_run: false,
+            local_analysis: false,
+            dry_run_id: None,
+            route_overrides,
         }
     }
+
+    #[test]
+    fn explicit_models_bypass_remote_model_source() {
+        let mut args = base_run_args();
+        args.model_source = ModelSource::Remote;
+        assert!(should_fetch_remote_routes(&args));
+
+        args.models = Some(vec![ModelGroup {
+            vendor: Vendor::OpenAi,
+            models: vec!["gpt-test".to_string()],
+        }]);
+        assert!(!should_fetch_remote_routes(&args));
+
+        args.dry_run = true;
+        assert!(!should_fetch_remote_routes(&args));
+    }
+
+    #[test]
+    fn filter_routes_uses_remote_route_override() {
+        let remote_route = ModelRoute::new(
+            "Remote Model",
+            Vendor::OpenRouter,
+            "openai/remote-model",
+            Some("openai/remote-model"),
+        );
+        let config = base_config(Some(vec![remote_route]));
+
+        let routes = filter_routes(&config);
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "Remote Model");
+        assert_eq!(routes[0].api_model, "openai/remote-model");
+    }
+
+    #[test]
+    fn filter_routes_does_not_synthesize_duplicate_for_display_name_match() {
+        let remote_route = ModelRoute::new(
+            "DeepSeek V4 Flash",
+            Vendor::DeepSeek,
+            "deepseek-v4-flash",
+            Some("deepseek/deepseek-v4-flash"),
+        );
+        let mut config = base_config(Some(vec![remote_route]));
+        let mut allowed = HashSet::new();
+        allowed.insert("deepseek v4 flash".to_string());
+        let mut filter = HashMap::new();
+        filter.insert(Vendor::DeepSeek, allowed);
+        config.model_filter = Some(filter);
+
+        let routes = filter_routes(&config);
+        assert_eq!(routes.len(), 1);
+        assert_eq!(routes[0].display_name, "DeepSeek V4 Flash");
+        assert_eq!(routes[0].api_model, "deepseek-v4-flash");
+    }
+
+    #[test]
+    fn category_filter_accepts_full_task_ids() {
+        let root = std::env::temp_dir().join(format!(
+            "llm-benchmark-test-{}",
+            std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap()
+                .as_nanos()
+        ));
+        fs::create_dir_all(root.join("basics").join("t_001_basic_tables")).unwrap();
+        fs::create_dir_all(root.join("schema").join("t_012_product_type")).unwrap();
+
+        let categories = HashSet::from(["basics".to_string()]);
+        let selectors = vec!["t_001_basic_tables".to_string(), "t_012_product_type".to_string()];
+
+        // Remove the scratch directory before unwrapping so a filter error cannot leak it.
+        let filtered = apply_category_filter(&root, Some(&categories), Some(&selectors));
+        fs::remove_dir_all(&root).unwrap();
+
+        assert_eq!(filtered.unwrap(), Some(vec!["t_001_basic_tables".to_string()]));
+    }
 }
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs
index c7a057c4638..8bb0d1ac734 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs
@@ -237,6 +237,12 @@ fn anthropic_max_output_tokens() -> u32 {
 pub fn normalize_anthropic_model(id: &str) -> &str {
     let lid = id.to_ascii_lowercase().replace('_', "-");
     match lid.as_str() {
+        // Opus 4.8
+        "opus-4.8" | "claude-opus-4.8" | "claude-opus-4-8" => "claude-opus-4-8",
+
+        // Sonnet 4.6
+        "sonnet-4.6" | "claude-sonnet-4.6" | "claude-sonnet-4-6" => "claude-sonnet-4-6",
+
         // Sonnet 4.5
         "sonnet-4.5" | "claude-sonnet-4.5" | "claude-sonnet-4-5" => "claude-sonnet-4-5",
         "claude-sonnet-4-5-20250929" => "claude-sonnet-4-5-20250929",
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
index 172beef8ff8..254fe5b8f63 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs
@@ -8,6 +8,9 @@ pub mod openai;
 pub mod openrouter;
 pub mod xai;
 
+use anyhow::Result;
+use async_trait::async_trait;
+
 pub use anthropic::AnthropicClient;
 pub use deepseek::DeepSeekClient;
 pub use google::GoogleGeminiClient;
@@ -15,3 +18,76 @@ pub use meta::MetaLlamaClient;
 pub use openai::OpenAiClient;
 pub use openrouter::OpenRouterClient;
 pub use xai::XaiGrokClient;
+
+use crate::llm::prompt::BuiltPrompt;
+use crate::llm::types::LlmOutput;
+
+/// Short human-readable result of a client preflight check; built from anything `Into<String>`.
+#[derive(Debug, Clone)]
+pub struct ClientPreflight {
+    summary: String,
+}
+
+impl ClientPreflight {
+    pub fn new(summary: impl Into<String>) -> Self {
+        let summary = summary.into();
+        Self { summary }
+    }
+
+    pub fn summary(&self) -> &str {
+        self.summary.as_str()
+    }
+}
+
+/// Common interface over provider clients: a provider label, a preflight hook, and generation.
+#[async_trait]
+pub trait LlmClient: Send + Sync {
+    /// Static provider label; the default `preflight` embeds it in its summary.
+    fn provider_name(&self) -> &'static str;
+
+    /// Default preflight: performs no check and reports that it was skipped.
+    async fn preflight(&self, model: &str) -> Result<ClientPreflight> {
+        let note = format!("{} credit preflight not implemented for model '{}'; skipped", self.provider_name(), model);
+        Ok(ClientPreflight::new(note))
+    }
+
+    async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput>;
+}
+
+/// Implements `LlmClient` for `$client`, forwarding `generate` to `<$client>::generate`.
+macro_rules! impl_direct_llm_client {
+    ($client:ty, $label:literal) => {
+        #[async_trait]
+        impl LlmClient for $client {
+            fn provider_name(&self) -> &'static str {
+                $label
+            }
+            async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
+                <$client>::generate(self, model, prompt).await
+            }
+        }
+    };
+}
+
+impl_direct_llm_client!(OpenAiClient, "OpenAI");
+impl_direct_llm_client!(AnthropicClient, "Anthropic");
+impl_direct_llm_client!(GoogleGeminiClient, "Google");
+impl_direct_llm_client!(XaiGrokClient, "xAI");
+impl_direct_llm_client!(DeepSeekClient, "DeepSeek");
+impl_direct_llm_client!(MetaLlamaClient, "Meta");
+
+/// `LlmClient` for OpenRouter; `preflight` wraps the summary returned by `preflight_credits`.
+#[async_trait]
+impl LlmClient for OpenRouterClient {
+    fn provider_name(&self) -> &'static str {
+        "OpenRouter"
+    }
+
+    async fn preflight(&self, model: &str) -> Result<ClientPreflight> {
+        Ok(ClientPreflight::new(self.preflight_credits(model).await?.summary()))
+    }
+
+    async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
+        OpenRouterClient::generate(self, model, prompt).await
+    }
+}
diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
index 623570298af..8e8642ada0b 100644
--- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
+++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs
@@ -1,5 +1,6 @@
-use anyhow::{anyhow, Context, Result};
-use serde::Serialize;
+use anyhow::{anyhow, bail, Context, Result};
+use serde::{Deserialize, Serialize};
+use std::env;
 
 use super::http::HttpClient;
 use super::oa_compat::OACompatResp;
@@ -35,6 +36,132 @@ impl OpenRouterClient {
         Self { base, api_key, http }
     }
 
+    /// Checks key and (optionally) account credits against the minimum threshold.
+    ///
+    /// Fails when a known remaining balance is at or below the threshold. When neither balance
+    /// is known, either honors the unchecked-credits escape hatch or sends a probe request.
+    pub async fn preflight_credits(&self, model: &str) -> Result<OpenRouterCreditStatus> {
+        let key_info = self.fetch_key_info().await?;
+        let min_credits = min_credits_threshold();
+
+        if let Some(remaining) = key_info.limit_remaining.filter(|left| *left <= min_credits) {
+            bail!(
+                "OpenRouter API key has insufficient remaining credits: {:.4} <= {:.4}",
+                remaining,
+                min_credits
+            );
+        }
+
+        // The account balance is only queried when a non-blank management key is configured.
+        let management_key = env::var("OPENROUTER_MANAGEMENT_API_KEY")
+            .ok()
+            .filter(|v| !v.trim().is_empty());
+        let account = match management_key {
+            Some(key) => Some(self.fetch_account_credits(&key).await?),
+            None => None,
+        };
+
+        if let Some(low) = account.as_ref().filter(|a| a.remaining <= min_credits) {
+            bail!(
+                "OpenRouter account has insufficient remaining credits: {:.4} <= {:.4}",
+                low.remaining,
+                min_credits
+            );
+        }
+
+        // With no balance information at all, fall back to the escape hatch or a live probe.
+        let nothing_known = account.is_none() && key_info.limit_remaining.is_none();
+        let unchecked_allowed = nothing_known && allow_unchecked_credits();
+        let model_probe = if nothing_known && !unchecked_allowed {
+            self.probe_model(model).await?;
+            Some(model.to_string())
+        } else {
+            None
+        };
+
+        Ok(OpenRouterCreditStatus {
+            key_limit: key_info.limit,
+            key_limit_remaining: key_info.limit_remaining,
+            account_remaining: account.map(|a| a.remaining),
+            min_credits,
+            model_probe,
+            unchecked_allowed,
+        })
+    }
+
+    /// GETs `{base}/key` authenticated with this client's API key.
+    /// Returns the `data` object of the JSON response.
+    async fn fetch_key_info(&self) -> Result<OpenRouterKeyInfo> {
+        let url = format!("{}/key", self.base.trim_end_matches('/'));
+        let headers = [HttpClient::bearer(&self.api_key)];
+        let fetched = self.http.get_text(&url, &headers).await;
+        let body = fetched.with_context(|| format!("OpenRouter key preflight GET {}", url))?;
+
+        let resp: OpenRouterKeyResp = serde_json::from_str(&body).context("parse OpenRouter key response")?;
+        let OpenRouterKeyResp { data } = resp;
+        Ok(data)
+    }
+
+    /// GETs `{base}/credits` authenticated with `management_key`.
+    /// The remaining balance is computed as `total_credits - total_usage`.
+    async fn fetch_account_credits(&self, management_key: &str) -> Result<OpenRouterAccountCredits> {
+        let url = format!("{}/credits", self.base.trim_end_matches('/'));
+        let headers = [HttpClient::bearer(management_key)];
+        let fetched = self.http.get_text(&url, &headers).await;
+        let body = fetched.with_context(|| format!("OpenRouter account credit preflight GET {}", url))?;
+
+        let resp: OpenRouterCreditsResp = serde_json::from_str(&body).context("parse OpenRouter credits response")?;
+        // Only the two totals are read from the response; the balance is derived locally.
+        let data = resp.data;
+        let remaining = data.total_credits - data.total_usage;
+        Ok(OpenRouterAccountCredits { remaining })
+    }
+
+    /// Sends a minimal "ping" chat completion for `model` and fails if the call or reply errors.
+    async fn probe_model(&self, model: &str) -> Result<()> {
+        #[derive(Serialize)]
+        struct ProbeMessage<'a> {
+            role: &'a str,
+            content: &'a str,
+        }
+
+        #[derive(Serialize)]
+        struct ProbeRequest<'a> {
+            model: &'a str,
+            messages: [ProbeMessage<'a>; 1],
+            temperature: f32,
+            max_tokens: u32,
+        }
+
+        let url = format!("{}/chat/completions", self.base.trim_end_matches('/'));
+        let ping = ProbeMessage {
+            role: "user",
+            content: "ping",
+        };
+        let req = ProbeRequest {
+            model,
+            messages: [ping],
+            temperature: 0.0,
+            max_tokens: 16,
+        };
+        let auth = HttpClient::bearer(&self.api_key);
+        let body = self
+            .http
+            .post_json(&url, &[auth], &req)
+            .await
+            .with_context(|| format!("OpenRouter model probe failed for '{model}'"))?;
+
+        // The parsed body is also checked for a top-level `error` object; report its message.
+        let resp: serde_json::Value = serde_json::from_str(&body).context("parse OpenRouter probe response")?;
+        if let Some(err) = resp.get("error") {
+            let message = err.get("message").and_then(|m| m.as_str());
+            let message = message.unwrap_or("unknown OpenRouter probe error");
+            bail!("OpenRouter model probe failed for '{}': {}", model, message);
+        }
+
+        Ok(())
+    }
+
     pub async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result<LlmOutput> {
         let url = format!("{}/chat/completions", self.base.trim_end_matches('/'));
 
@@ -125,6 +252,100 @@ impl OpenRouterClient {
     }
 }
 
+#[derive(Debug, Clone)]
+pub struct OpenRouterCreditStatus {
+    pub key_limit: Option<f64>,
+    pub key_limit_remaining: Option<f64>,
+    pub account_remaining: Option<f64>,
+    pub min_credits: f64,
+    pub model_probe: Option<String>,
+    pub unchecked_allowed: bool,
+}
+
+impl OpenRouterCreditStatus {
+    /// Renders the key limit, account balance, threshold, and probe/override outcome as one line.
+    pub fn summary(&self) -> String {
+        // Describe the key using whichever of limit/remaining is known.
+        let key_remaining = match (self.key_limit, self.key_limit_remaining) {
+            (None, None) => "key has no configured limit".to_string(),
+            (None, Some(remaining)) => format!("key remaining {remaining:.4}"),
+            (Some(limit), None) => format!("key limit {limit:.4}, remaining unknown"),
+            (Some(limit), Some(remaining)) => format!("key remaining {remaining:.4}/{limit:.4}"),
+        };
+
+        let credit_status = if let Some(remaining) = self.account_remaining {
+            format!(
+                "{key_remaining}; account remaining {remaining:.4}; min {:.4}",
+                self.min_credits
+            )
+        } else {
+            format!(
+                "{key_remaining}; account balance not checked (set OPENROUTER_MANAGEMENT_API_KEY); min {:.4}",
+                self.min_credits
+            )
+        };
+
+        // A recorded probe takes precedence over the unchecked-credits note.
+        match (&self.model_probe, self.unchecked_allowed) {
+            (Some(model), _) => format!("{credit_status}; model probe OK for '{model}'"),
+            (None, true) => format!("{credit_status}; unchecked credits allowed by OPENROUTER_ALLOW_UNCHECKED_CREDITS"),
+            (None, false) => credit_status,
+        }
+    }
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterKeyResp {
+    data: OpenRouterKeyInfo,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterKeyInfo {
+    limit: Option<f64>,
+    limit_remaining: Option<f64>,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterCreditsResp {
+    data: OpenRouterCreditsData,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenRouterCreditsData {
+    total_credits: f64,
+    total_usage: f64,
+}
+
+#[derive(Debug, Clone)]
+struct OpenRouterAccountCredits {
+    remaining: f64,
+}
+
+/// Reads `OPENROUTER_MIN_CREDITS` and `LLM_MIN_CREDITS` and resolves the effective threshold.
+fn min_credits_threshold() -> f64 {
+    let [specific, fallback] = ["OPENROUTER_MIN_CREDITS", "LLM_MIN_CREDITS"].map(|name| env::var(name).ok());
+    parse_min_credits_threshold(specific.as_deref(), fallback.as_deref())
+}
+
+/// True when `OPENROUTER_ALLOW_UNCHECKED_CREDITS` holds a value that `parse_env_flag` accepts.
+fn allow_unchecked_credits() -> bool {
+    parse_env_flag(env::var("OPENROUTER_ALLOW_UNCHECKED_CREDITS").ok().as_deref())
+}
+
+/// First finite threshold from `openrouter`, then `global`; unusable values are skipped (default 0.0).
+fn parse_min_credits_threshold(openrouter: Option<&str>, global: Option<&str>) -> f64 {
+    // "nan"/"inf" parse as f64 but would make the `remaining <= min` checks never/always fire.
+    let usable = |raw: &str| raw.trim().parse::<f64>().ok().filter(|t| t.is_finite());
+    let first = [openrouter, global].into_iter().flatten().find_map(usable);
+    first.unwrap_or(0.0)
+}
+
+/// True only for `1`, `true`, `yes`, or `y` (trimmed, ASCII case-insensitive); unset is false.
+fn parse_env_flag(value: Option<&str>) -> bool {
+    let normalized = value.map(|raw| raw.trim().to_ascii_lowercase());
+    matches!(normalized.as_deref(), Some("1" | "true" | "yes" | "y"))
+}
+
 /// Context limits for models accessed via OpenRouter.
 /// Uses the same limits as direct clients where known,
 /// falls back to a conservative default.
@@ -133,26 +354,44 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize {
 
     // Anthropic
     if m.contains("claude") {
+        if m.contains("4.6")
+            || m.contains("4-6")
+            || m.contains("4.7")
+            || m.contains("4-7")
+            || m.contains("4.8")
+            || m.contains("4-8")
+        {
+            return 1_000_000;
+        }
         return 185_000;
     }
     // OpenAI
+    if m.contains("gpt-5.5") {
+        return 1_050_000;
+    }
     if m.contains("gpt-5") || m.contains("gpt-4.1") {
         return 400_000;
     }
     if m.contains("gpt-4o") || m.contains("gpt-4") {
         return 128_000;
     }
-    // xAI / Grok — leave ~50 k headroom for segments + output on top of trimmed prefix
-    if m.contains("grok-code-fast") {
+    // xAI / Grok
+    if m.contains("grok-build-0.1") || m.contains("grok-code-fast") {
         return 200_000;
     }
+    if m.contains("grok-4.3") {
+        return 1_000_000;
+    }
     if m.contains("grok-4") {
         return 200_000;
     }
     if m.contains("grok") {
         return 90_000;
     }
-    // DeepSeek — hard cap is 131 072 on OpenRouter; leave ~25 k headroom for segments + output
+    // DeepSeek
+    if m.contains("deepseek-v4") {
+        return 1_000_000;
+    }
     if m.contains("deepseek") {
         return 106_000;
     }
@@ -173,3 +412,26 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize {
 
     DEFAULT_CTX_LIMIT
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{parse_env_flag, parse_min_credits_threshold};
+
+    #[test]
+    fn openrouter_min_credits_overrides_global_threshold() {
+        assert_eq!(parse_min_credits_threshold(Some("2.5"), Some("1.0")), 2.5);
+        assert_eq!(parse_min_credits_threshold(None, Some("1.0")), 1.0);
+        assert_eq!(parse_min_credits_threshold(Some("not-a-number"), Some("1.0")), 1.0);
+        assert_eq!(parse_min_credits_threshold(None, None), 0.0);
+    }
+
+    #[test]
+    fn unchecked_credit_escape_hatch_accepts_common_true_values() {
+        for value in ["1", "true", "TRUE", " yes ", "y"] {
+            assert!(parse_env_flag(Some(value)));
+        }
+        for value in [None, Some(""), Some("0"), Some("false"), Some("no")] {
+            assert!(!parse_env_flag(value));
+        }
+    }
+}
diff --git a/tools/xtask-llm-benchmark/src/llm/model_routes.rs b/tools/xtask-llm-benchmark/src/llm/model_routes.rs
index e136976adb6..7f7ae93b66c 100644
--- a/tools/xtask-llm-benchmark/src/llm/model_routes.rs
+++ b/tools/xtask-llm-benchmark/src/llm/model_routes.rs
@@ -13,16 +13,16 @@ pub struct ModelRoute {
 static DEFAULT_ROUTES: LazyLock<Vec<ModelRoute>> = LazyLock::new(|| {
     use Vendor::*;
     vec![
-        // OpenAI: Best GPT-5.2-Codex, Cheaper GPT-5-mini
-        ModelRoute::new("GPT-5.2-Codex", OpenAi, "gpt-5.2-codex", Some("openai/gpt-5.2-codex")),
-        ModelRoute::new("GPT-5-mini", OpenAi, "gpt-5-mini", Some("openai/gpt-5-mini")),
-        // Claude: Best Opus 4.6, Cheaper Sonnet 4.6
-        // Direct API uses dashes (claude-opus-4-6); OpenRouter uses dots (claude-opus-4.6)
+        // OpenAI: Best GPT-5.5, Cheaper GPT-5.4-mini
+        ModelRoute::new("GPT-5.5", OpenAi, "gpt-5.5", Some("openai/gpt-5.5")),
+        ModelRoute::new("GPT-5.4-mini", OpenAi, "gpt-5.4-mini", Some("openai/gpt-5.4-mini")),
+        // Claude: Best Opus 4.8, Cheaper Sonnet 4.6
+        // Direct API uses dashes (claude-opus-4-8); OpenRouter uses dots (claude-opus-4.8)
         ModelRoute::new(
-            "Claude Opus 4.6",
+            "Claude Opus 4.8",
             Anthropic,
-            "claude-opus-4-6",
-            Some("anthropic/claude-opus-4.6"),
+            "claude-opus-4-8",
+            Some("anthropic/claude-opus-4.8"),
         ),
         ModelRoute::new(
             "Claude Sonnet 4.6",
@@ -30,9 +30,9 @@ static DEFAULT_ROUTES: LazyLock<Vec<ModelRoute>> = LazyLock::new(|| {
             "claude-sonnet-4-6",
             Some("anthropic/claude-sonnet-4.6"),
         ),
-        // Grok: Best Grok 4, Cheaper Grok Code
-        ModelRoute::new("Grok 4", Xai, "grok-4", Some("x-ai/grok-4.20-beta")),
-        ModelRoute::new("Grok Code", Xai, "grok-code-fast-1", Some("x-ai/grok-code-fast-1")),
+        // Grok: Best Grok 4.3, coding-specialized Grok Build
+        ModelRoute::new("Grok 4.3", Xai, "grok-4.3", Some("x-ai/grok-4.3")),
+        ModelRoute::new("Grok Build 0.1", Xai, "grok-build-0.1", Some("x-ai/grok-build-0.1")),
         // Gemini: direct via GOOGLE_API_KEY, falls back to OpenRouter if not set
         ModelRoute::new(
             "Gemini 3.1 Pro",
@@ -41,24 +41,23 @@ static DEFAULT_ROUTES: LazyLock<Vec<ModelRoute>> = LazyLock::new(|| {
             Some("google/gemini-3.1-pro-preview"),
         ),
         ModelRoute::new(
-            "Gemini 3 Flash",
+            "Gemini 3.5 Flash",
             Google,
-            "gemini-3-flash-preview",
-            Some("google/gemini-3-flash-preview"),
+            "gemini-3.5-flash",
+            Some("google/gemini-3.5-flash"),
         ),
-        // DeepSeek: Reasoner (thinking), Chat (general)
-        // deepseek-reasoner is listed as deepseek-r1 on OpenRouter
+        // DeepSeek: Pro (highest capability), Flash (cheaper/faster)
         ModelRoute::new(
-            "DeepSeek Reasoner",
+            "DeepSeek V4 Pro",
             DeepSeek,
-            "deepseek-reasoner",
-            Some("deepseek/deepseek-r1"),
+            "deepseek-v4-pro",
+            Some("deepseek/deepseek-v4-pro"),
         ),
         ModelRoute::new(
-            "DeepSeek Chat",
+            "DeepSeek V4 Flash",
             DeepSeek,
-            "deepseek-chat",
-            Some("deepseek/deepseek-chat"),
+            "deepseek-v4-flash",
+            Some("deepseek/deepseek-v4-flash"),
         ),
     ]
 });
diff --git a/tools/xtask-llm-benchmark/src/llm/provider.rs b/tools/xtask-llm-benchmark/src/llm/provider.rs
index 65d587d9526..355f2e19a3e 100644
--- a/tools/xtask-llm-benchmark/src/llm/provider.rs
+++ b/tools/xtask-llm-benchmark/src/llm/provider.rs
@@ -1,8 +1,10 @@
 use anyhow::{Context, Result};
 use async_trait::async_trait;
+use std::collections::HashMap;
 
 use crate::llm::clients::{
-    AnthropicClient, DeepSeekClient, GoogleGeminiClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, XaiGrokClient,
+    AnthropicClient, DeepSeekClient, GoogleGeminiClient, LlmClient, MetaLlamaClient, OpenAiClient, OpenRouterClient,
+    XaiGrokClient,
 };
 use crate::llm::model_routes::ModelRoute;
 use crate::llm::prompt::BuiltPrompt;
@@ -10,19 +12,12 @@ use crate::llm::types::{LlmOutput, Vendor};
 
 #[async_trait]
 pub trait LlmProvider: Send + Sync {
+    async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()>;
     async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result<LlmOutput>;
 }
 
 pub struct RouterProvider {
-    pub openai: Option<OpenAiClient>,
-    pub anthropic: Option<AnthropicClient>,
-    pub google: Option<GoogleGeminiClient>,
-    pub xai: Option<XaiGrokClient>,
-    pub deepseek: Option<DeepSeekClient>,
-    pub meta: Option<MetaLlamaClient>,
-    /// OpenRouter client used as a unified fallback when a direct vendor client
-    /// is not configured. Set via `OPENROUTER_API_KEY`.
-    pub openrouter: Option<OpenRouterClient>,
+    clients: HashMap<Vendor, Box<dyn LlmClient>>,
     pub force: Option<Vendor>,
 }
 
@@ -38,111 +33,145 @@ impl RouterProvider {
         openrouter: Option<OpenRouterClient>,
         force: Option<Vendor>,
     ) -> Self {
-        Self {
-            openai,
-            anthropic,
-            google,
-            xai,
-            deepseek,
-            meta,
-            openrouter,
-            force,
+        let mut clients: HashMap<Vendor, Box<dyn LlmClient>> = HashMap::new();
+
+        if let Some(client) = openai {
+            clients.insert(Vendor::OpenAi, Box::new(client));
+        }
+        if let Some(client) = anthropic {
+            clients.insert(Vendor::Anthropic, Box::new(client));
+        }
+        if let Some(client) = google {
+            clients.insert(Vendor::Google, Box::new(client));
+        }
+        if let Some(client) = xai {
+            clients.insert(Vendor::Xai, Box::new(client));
+        }
+        if let Some(client) = deepseek {
+            clients.insert(Vendor::DeepSeek, Box::new(client));
         }
+        if let Some(client) = meta {
+            clients.insert(Vendor::Meta, Box::new(client));
+        }
+        if let Some(client) = openrouter {
+            clients.insert(Vendor::OpenRouter, Box::new(client));
+        }
+
+        Self { clients, force }
     }
 }
 
+struct ResolvedClient<'a> {
+    client: &'a dyn LlmClient,
+    endpoint_name: &'static str,
+    model: String,
+    fallback_from: Option<&'static str>,
+    search_enabled: bool,
+}
+
 #[async_trait]
 impl LlmProvider for RouterProvider {
+    /// Resolves the client for `route`, runs its preflight, and logs the summary to stderr.
+    async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()> {
+        let resolved = self.resolve_client(route, search_enabled)?;
+        let (endpoint, model) = (resolved.endpoint_name, resolved.model.as_str());
+        let outcome = resolved.client.preflight(model).await;
+        let status = outcome.with_context(|| {
+            format!("{} credit preflight failed for model '{}'", endpoint, model)
+        })?;
+
+        eprintln!(
+            "[preflight] {} -> {} '{}' OK ({})",
+            route.display_name,
+            endpoint,
+            model,
+            status.summary()
+        );
+        Ok(())
+    }
+
     async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result<LlmOutput> {
-        // Web search mode: route all models through OpenRouter with :online suffix.
-        // OpenRouter's :online feature adds Bing-powered web search to any model.
-        if prompt.search_enabled {
-            let cli = self.openrouter.as_ref().context(
-                "Search mode requires OPENROUTER_API_KEY — OpenRouter provides unified web search via :online models",
-            )?;
+        let resolved = self.resolve_client(route, prompt.search_enabled)?;
+
+        if resolved.search_enabled {
+            eprintln!(
+                "[search] {} -> OpenRouter :online model '{}'",
+                route.display_name, resolved.model
+            );
+        } else if let Some(vendor_name) = resolved.fallback_from {
+            eprintln!(
+                "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'",
+                vendor_name, resolved.model
+            );
+        }
+
+        resolved.client.generate(&resolved.model, prompt).await
+    }
+}
+
+impl RouterProvider {
+    fn resolve_client<'a>(&'a self, route: &ModelRoute, search_enabled: bool) -> Result<ResolvedClient<'a>> {
+        if search_enabled {
             let base_model = route
                 .openrouter_model
                 .clone()
                 .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model));
-            let online_model = format!("{base_model}:online");
-            eprintln!(
-                "[search] {} → OpenRouter :online model '{}'",
-                route.display_name, online_model
-            );
-            return cli.generate(&online_model, prompt).await;
+            return self.resolve_openrouter(format!("{base_model}:online"), None, true);
         }
 
         let vendor = self.force.unwrap_or(route.vendor);
 
-        // If vendor is explicitly OpenRouter, or if the direct client isn't configured
-        // but OpenRouter is available, route through OpenRouter.
         if vendor == Vendor::OpenRouter {
-            let cli = self
-                .openrouter
-                .as_ref()
-                .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?;
             let model = route.openrouter_model.as_deref().unwrap_or(&route.api_model);
-            return cli.generate(model, prompt).await;
+            return self.resolve_openrouter(model.to_string(), None, false);
         }
 
-        // Try direct client first, fall back to OpenRouter if available.
-        match vendor {
-            Vendor::OpenAi => match self.openai.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "OpenAI").await,
-            },
-            Vendor::Anthropic => match self.anthropic.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "Anthropic").await,
-            },
-            Vendor::Google => match self.google.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "Google").await,
-            },
-            Vendor::Xai => match self.xai.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "xAI").await,
-            },
-            Vendor::DeepSeek => match self.deepseek.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "DeepSeek").await,
-            },
-            Vendor::Meta => match self.meta.as_ref() {
-                Some(cli) => cli.generate(&route.api_model, prompt).await,
-                None => self.fallback_openrouter(route, prompt, "Meta").await,
-            },
-            Vendor::OpenRouter => unreachable!("handled above"),
-        }
+        let direct = self.clients.get(&vendor).map(|client| client.as_ref());
+        self.resolve_direct_or_openrouter(direct, route, vendor)
     }
-}
 
-impl RouterProvider {
-    /// Fall back to the OpenRouter client when a direct vendor client is not configured.
-    async fn fallback_openrouter(
-        &self,
+    fn resolve_direct_or_openrouter<'a>(
+        &'a self,
+        direct: Option<&'a dyn LlmClient>,
         route: &ModelRoute,
-        prompt: &BuiltPrompt,
-        vendor_name: &str,
-    ) -> Result<LlmOutput> {
-        match self.openrouter.as_ref() {
-            Some(cli) => {
-                let or_model = route
-                    .openrouter_model
-                    .clone()
-                    .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model));
-                eprintln!(
-                    "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'",
-                    vendor_name, or_model
-                );
-                cli.generate(&or_model, prompt).await
-            }
-            None => anyhow::bail!(
-                "{} client not configured and no OpenRouter fallback available. \
-                 Set {}_API_KEY or OPENROUTER_API_KEY.",
-                vendor_name,
-                vendor_name.to_ascii_uppercase()
-            ),
+        vendor: Vendor,
+    ) -> Result<ResolvedClient<'a>> {
+        if let Some(client) = direct {
+            return Ok(ResolvedClient {
+                client,
+                endpoint_name: vendor.display_name(),
+                model: route.api_model.clone(),
+                fallback_from: None,
+                search_enabled: false,
+            });
         }
+
+        let model = route
+            .openrouter_model
+            .clone()
+            .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model));
+        self.resolve_openrouter(model, Some(vendor.display_name()), false)
+    }
+
+    /// Looks up the OpenRouter client for `model`; the error names the vendor it was a fallback for.
+    fn resolve_openrouter<'a>(
+        &'a self,
+        model: String,
+        fallback_from: Option<&'static str>,
+        search_enabled: bool,
+    ) -> Result<ResolvedClient<'a>> {
+        let openrouter = self.clients.get(&Vendor::OpenRouter);
+        let client = openrouter.map(|client| client.as_ref()).with_context(|| {
+            let base = "OpenRouter client not configured (set OPENROUTER_API_KEY)";
+            fallback_from.map_or_else(|| base.to_string(), |v| format!("{base}; needed as fallback for {v}"))
+        })?;
+        Ok(ResolvedClient {
+            client,
+            endpoint_name: "OpenRouter",
+            model,
+            fallback_from,
+            search_enabled,
+        })
     }
 }
 
diff --git a/tools/xtask-llm-benchmark/src/llm/segmentation.rs b/tools/xtask-llm-benchmark/src/llm/segmentation.rs
index 2926852ada0..26bc481e52f 100644
--- a/tools/xtask-llm-benchmark/src/llm/segmentation.rs
+++ b/tools/xtask-llm-benchmark/src/llm/segmentation.rs
@@ -88,14 +88,29 @@ pub fn build_anthropic_messages(
 }
 
 // Provider-specific context limits
-pub fn anthropic_ctx_limit_tokens(_model: &str) -> usize {
-    // Anthropic hard limit is 200k; reserve ~15k for tokenizer variance + system/segments
+pub fn anthropic_ctx_limit_tokens(model: &str) -> usize {
+    let m = model.to_ascii_lowercase();
+
+    // Claude 4.6 through 4.8 models (matched by version substring) expose a 1M context window; no headroom is reserved.
+    if m.contains("4-6")
+        || m.contains("4.6")
+        || m.contains("4-7")
+        || m.contains("4.7")
+        || m.contains("4-8")
+        || m.contains("4.8")
+    {
+        return 1_000_000;
+    }
+
+    // All other Anthropic models have a 200k hard limit; reserve ~15k for tokenizer variance + system/segments.
     185_000
 }
 
 pub fn openai_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
-    if m.contains("gpt-5") || m.contains("gpt-4.1") {
+    if m.contains("gpt-5.5") {
+        1_050_000
+    } else if m.contains("gpt-5") || m.contains("gpt-4.1") {
         400_000
     } else {
         128_000
@@ -105,7 +120,13 @@ pub fn openai_ctx_limit_tokens(model: &str) -> usize {
 pub fn deepseek_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
 
-    // API limit 128K for deepseek-chat and deepseek-reasoner
+    if m.starts_with("deepseek-v4") {
+        return 1_000_000;
+    }
+    if m.starts_with("deepseek-v3.2") {
+        return 128_000;
+    }
+    // API limit 128K for deepseek-chat and deepseek-reasoner compatibility aliases.
     if m.starts_with("deepseek-reasoner") || m.starts_with("deepseek-r1") {
         return 128_000;
     }
@@ -123,8 +144,8 @@ pub fn deepseek_ctx_limit_tokens(model: &str) -> usize {
 pub fn gemini_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
 
-    // Gemini 2.5 series (very large)
-    if m.contains("2.5") && (m.contains("pro") || m.contains("flash")) {
+    // Gemini 3.x and 2.5 series (1M tokens). NOTE(review): "3." needs a dotted id; "gemini-3-pro" won't match.
+    if (m.contains("3.") || m.contains("2.5")) && (m.contains("pro") || m.contains("flash")) {
         return 1_000_000;
     }
 
@@ -160,9 +181,12 @@ pub fn meta_ctx_limit_tokens(model: &str) -> usize {
 
 pub fn xai_ctx_limit_tokens(model: &str) -> usize {
     let m = model.to_ascii_lowercase();
-    if m.contains("grok-code-fast-1") {
+    if m.contains("grok-build-0.1") || m.contains("grok-code-fast-1") {
         return 256_000;
     }
+    if m.contains("grok-4.3") {
+        return 1_000_000;
+    }
     if m.contains("grok-4") || m.contains("grok-3") {
         return 128_000;
     }
diff --git a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj
index ce04141c7a0..f286932badd 100644
--- a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj
+++ b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj
@@ -1,9 +1,5 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
-  <!-- Import Runtime build props/targets when using ProjectReference (NuGet auto-imports these; ProjectReference does not) -->
-  <Import Project="{SPACETIME_CSHARP_RUNTIME_DIR}/build/SpacetimeDB.Runtime.props" />
-  <Import Project="{SPACETIME_CSHARP_RUNTIME_DIR}/build/SpacetimeDB.Runtime.targets" />
-
   <PropertyGroup>
     <TargetFramework>net8.0</TargetFramework>
     <RuntimeIdentifier>wasi-wasm</RuntimeIdentifier>
@@ -12,9 +8,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <ProjectReference Include="{SPACETIME_CSHARP_RUNTIME_REF}" />
-    <!-- Codegen is packed into the NuGet Runtime package; with ProjectReference we must add it explicitly -->
-    <ProjectReference Include="{SPACETIME_CSHARP_CODEGEN_REF}" OutputItemType="Analyzer" ReferenceOutputAssembly="false" />
+    <PackageReference Include="SpacetimeDB.Runtime" Version="{SPACETIME_CSHARP_RUNTIME_VERSION}" />
   </ItemGroup>
 
-</Project>
\ No newline at end of file
+</Project>