diff --git a/.github/workflows/llm-benchmark-periodic.yml b/.github/workflows/llm-benchmark-periodic.yml index 40ad2c75fe4..da3af16b609 100644 --- a/.github/workflows/llm-benchmark-periodic.yml +++ b/.github/workflows/llm-benchmark-periodic.yml @@ -2,15 +2,23 @@ name: Periodic LLM benchmarks on: schedule: - # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h, - # or '0 */4 * * *' for every 4h. - - cron: '0 0 * * *' + # Weekly on Monday at midnight UTC. + - cron: '0 0 * * 1' workflow_dispatch: inputs: + model_set: + description: 'Model set to run' + required: false + type: choice + options: + - website_active + - local_defaults + - explicit + default: website_active models: - description: 'Models to run (provider:model format, comma-separated, or "all")' + description: 'Space-separated provider:model groups. Required when model_set=explicit.' required: false - default: 'all' + default: '' languages: description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)' required: false @@ -19,12 +27,24 @@ on: description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)' required: false default: 'guidelines,no_context' + categories: + description: 'Optional benchmark categories to run (comma-separated)' + required: false + default: '' + tasks: + description: 'Optional benchmark task ids/selectors to run (comma-separated)' + required: false + default: '' + dry_run: + description: 'Run benchmarks without uploading results' + required: false + default: 'false' permissions: contents: read concurrency: - group: llm-benchmark-periodic + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true jobs: @@ -33,10 +53,9 @@ jobs: timeout-minutes: 180 steps: - - name: Checkout master + - name: Checkout repository uses: actions/checkout@v4 with: - ref: master fetch-depth: 1 - uses: dtolnay/rust-toolchain@stable @@ -45,7 +64,7 @@ jobs: - name: Setup .NET SDK uses: actions/setup-dotnet@v4 with: - dotnet-version: "8.0.x" + global-json-file: global.json - name: Install WASI workload env: @@ -55,13 +74,28 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel + - name: Pack C# runtime packages + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }} + run: | + dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime + dotnet pack -c Release crates/bindings-csharp/Runtime + - name: Set up Node.js + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }} uses: actions/setup-node@v4 with: node-version: 22 - name: Install pnpm + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }} uses: ./.github/actions/setup-pnpm + with: + run_install: true + + - name: Build TypeScript SDK + if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }} + run: pnpm build + working-directory: crates/bindings-typescript - name: Build llm-benchmark tool run: cargo install --path tools/xtask-llm-benchmark --locked @@ -78,30 +112,87 @@ jobs: OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + LLM_VENDOR: openrouter LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }} LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }} + DOTNET_MULTILEVEL_LOOKUP: "0" + DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1" MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" + LLM_BENCH_CSHARP_CONCURRENCY: "1" INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }} - INPUT_MODELS: ${{ inputs.models || 'all' }} + INPUT_MODEL_SET: ${{ inputs.model_set || 'website_active' }} + INPUT_MODELS: ${{ inputs.models || '' }} INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }} + INPUT_CATEGORIES: ${{ inputs.categories || '' }} + INPUT_TASKS: ${{ inputs.tasks || '' }} + INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }} run: | LANGS="$INPUT_LANGUAGES" + MODEL_SET="$INPUT_MODEL_SET" MODELS="$INPUT_MODELS" MODES="$INPUT_MODES" + CATEGORIES="$INPUT_CATEGORIES" + TASKS="$INPUT_TASKS" + DRY_RUN="$INPUT_DRY_RUN" + + case "$MODEL_SET" in + website_active) + if [ -n "$MODELS" ]; then + echo "::error::models is only valid when model_set=explicit" + exit 1 + fi + ;; + local_defaults) + if [ -n "$MODELS" ]; then + echo "::error::models is only valid when model_set=explicit" + exit 1 + fi + ;; + explicit) + if [ -z "$MODELS" ]; then + echo "::error::models is required when model_set=explicit" + exit 1 + fi + read -r -a MODEL_ARGS <<< "$MODELS" + ;; + *) + echo "::error::unknown model_set '$MODEL_SET' (expected website_active, local_defaults, or explicit)" + exit 1 + ;; + esac SUCCEEDED=0 FAILED=0 for LANG in $(echo "$LANGS" | tr ',' ' '); do - if [ "$MODELS" = "all" ]; then - if llm_benchmark run --lang "$LANG" --modes "$MODES"; then + EXTRA_ARGS=() + if [ -n "$CATEGORIES" ]; then + EXTRA_ARGS+=(--categories "$CATEGORIES") + fi + if [ -n "$TASKS" ]; then + EXTRA_ARGS+=(--tasks "$TASKS") + fi + if [ "$DRY_RUN" = "true" ]; then + EXTRA_ARGS+=(--dry-run) + fi + + if [ "$MODEL_SET" = "website_active" ]; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then + SUCCEEDED=$((SUCCEEDED + 1)) + else + echo "::warning::Benchmark run failed for lang=$LANG" + FAILED=$((FAILED + 1)) + fi + elif [ "$MODEL_SET" = "local_defaults" ]; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" "${EXTRA_ARGS[@]}"; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG" FAILED=$((FAILED + 1)) fi else - if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "$MODELS"; then + if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then SUCCEEDED=$((SUCCEEDED + 1)) else echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS" @@ -110,7 +201,7 @@ jobs: fi done echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed" - if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then - echo "::error::All benchmark runs failed" + if [ "$FAILED" -gt 0 ]; then + echo "::error::$FAILED benchmark run(s) failed" exit 1 fi diff --git a/.github/workflows/llm-benchmark-validate-goldens.yml b/.github/workflows/llm-benchmark-validate-goldens.yml index 17384a654e3..a2d2ef87a3e 100644 --- a/.github/workflows/llm-benchmark-validate-goldens.yml +++ b/.github/workflows/llm-benchmark-validate-goldens.yml @@ -2,15 +2,26 @@ name: Validate LLM benchmark golden answers on: schedule: - # Nightly at 2 AM UTC - - cron: '0 2 * * *' - workflow_dispatch: {} + # Weekly on Monday at 2 AM UTC. + - cron: '0 2 * * 1' + workflow_dispatch: + inputs: + lang: + description: 'Language to validate for manual smoke runs' + required: false + type: choice + default: all + options: + - all + - rust + - csharp + - typescript permissions: contents: read concurrency: - group: llm-benchmark-validate-goldens + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true jobs: @@ -21,13 +32,12 @@ jobs: strategy: fail-fast: false matrix: - lang: [rust, csharp, typescript] + lang: ${{ fromJSON(github.event_name == 'workflow_dispatch' && inputs.lang != 'all' && format('["{0}"]', inputs.lang) || '["rust","csharp","typescript"]') }} steps: - - name: Checkout master + - name: Checkout repository uses: actions/checkout@v4 with: - ref: master fetch-depth: 1 - uses: dtolnay/rust-toolchain@stable @@ -37,7 +47,7 @@ jobs: if: matrix.lang == 'csharp' uses: actions/setup-dotnet@v4 with: - dotnet-version: "8.0.x" + global-json-file: global.json - name: Install WASI workload if: matrix.lang == 'csharp' @@ -48,6 +58,12 @@ jobs: run: | dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel + - name: Pack C# runtime packages + if: matrix.lang == 'csharp' + run: | + dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime + dotnet pack -c Release crates/bindings-csharp/Runtime + - name: Set up Node.js if: matrix.lang == 'typescript' uses: actions/setup-node@v4 @@ -57,6 +73,13 @@ jobs: - name: Install pnpm if: matrix.lang == 'typescript' uses: ./.github/actions/setup-pnpm + with: + run_install: true + + - name: Build TypeScript SDK + if: matrix.lang == 'typescript' + run: pnpm build + working-directory: crates/bindings-typescript - name: Build llm-benchmark tool run: cargo install --path tools/xtask-llm-benchmark --locked @@ -70,7 +93,11 @@ jobs: - name: Validate golden answers (${{ matrix.lang }}) env: + DOTNET_MULTILEVEL_LOOKUP: "0" + DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1" MSBUILDDISABLENODEREUSE: "1" DOTNET_CLI_USE_MSBUILD_SERVER: "0" + LLM_BENCH_CSHARP_CONCURRENCY: "1" run: | llm_benchmark run --goldens-only --lang ${{ matrix.lang }} diff --git a/tools/xtask-llm-benchmark/src/api/client.rs b/tools/xtask-llm-benchmark/src/api/client.rs index edc61756152..0b43ccb5bac 100644 --- a/tools/xtask-llm-benchmark/src/api/client.rs +++ b/tools/xtask-llm-benchmark/src/api/client.rs @@ -1,34 +1,118 @@ -use anyhow::{Context, Result}; +use anyhow::{anyhow, Context, Result}; use serde_json::json; use crate::bench::normalize::{canonical_mode, normalize_model_names}; use crate::bench::types::{Results, RunOutcome}; +use crate::llm::types::Vendor; +use crate::llm::ModelRoute; + +#[derive(Debug)] +struct RemoteModelRouteRow { + display_name: String, + vendor: String, + api_model: String, + openrouter_model: Option, + active: Option, + available: Option, +} + +fn read_string_field(row: &serde_json::Map, keys: &[&str]) -> Option { + keys.iter() + .find_map(|key| row.get(*key).and_then(|value| value.as_str())) + .map(str::to_string) +} + +fn read_bool_field(row: &serde_json::Map, keys: &[&str]) -> Option { + keys.iter() + .find_map(|key| row.get(*key).and_then(|value| value.as_bool())) +} + +fn parse_model_route_value(value: serde_json::Value) -> Result { + let row = value + .as_object() + .ok_or_else(|| anyhow!("remote model row must be an object"))?; + + Ok(RemoteModelRouteRow { + display_name: read_string_field(row, &["display_name", "displayName", "name"]).unwrap_or_default(), + vendor: read_string_field(row, &["vendor"]).unwrap_or_default(), + api_model: read_string_field(row, &["api_model", "apiModel"]).unwrap_or_default(), + openrouter_model: read_string_field(row, &["openrouter_model", "openrouterModel"]), + active: read_bool_field(row, &["active"]), + available: read_bool_field(row, &["available"]), + }) +} + +fn parse_model_route_row(row: RemoteModelRouteRow) -> Result> { + if row.active == Some(false) || row.available == Some(false) { + return Ok(None); + } + + let vendor = Vendor::parse(&row.vendor).ok_or_else(|| anyhow!("unknown model vendor '{}'", row.vendor))?; + let display_name = row.display_name.trim(); + let api_model = row.api_model.trim(); + + if display_name.is_empty() { + anyhow::bail!("remote model row is missing display_name"); + } + if api_model.is_empty() { + anyhow::bail!("remote model row '{}' is missing api_model", display_name); + } + + Ok(Some(ModelRoute::new( + display_name, + vendor, + api_model, + row.openrouter_model.as_deref().filter(|s| !s.trim().is_empty()), + ))) +} + +pub fn parse_model_routes_response(body: &serde_json::Value) -> Result> { + let models = body.get("models").unwrap_or(body); + let rows: Vec = + serde_json::from_value(models.clone()).context("parse llm benchmark model rows")?; + + let mut routes = Vec::new(); + for row in rows.into_iter().map(parse_model_route_value) { + let row = row?; + if let Some(route) = parse_model_route_row(row)? { + routes.push(route); + } + } + + if routes.is_empty() { + anyhow::bail!("no active available LLM benchmark models returned by website"); + } + + Ok(routes) +} /// HTTP client for the SpacetimeDB LLM benchmark API (spacetime-web Postgres). /// -/// Supports two POST endpoints that already exist in spacetime-web: -/// - `POST /api/llm-benchmark-upload` — upload benchmark results -/// - `POST /api/llm-benchmark-tasks` — upload task catalog +/// Supports endpoints owned by spacetime-web: +/// - `POST /api/llm-benchmark-upload` - upload benchmark results +/// - `POST /api/llm-benchmark-tasks` - upload task catalog +/// - `GET /api/llm-benchmark-models?active=true` - fetch active benchmark models #[derive(Clone)] pub struct ApiClient { - client: reqwest::blocking::Client, base_url: String, api_key: String, } impl ApiClient { pub fn new(base_url: &str, api_key: &str) -> Result { - let client = reqwest::blocking::Client::builder() - .timeout(std::time::Duration::from_secs(120)) - .build() - .context("failed to build HTTP client")?; Ok(Self { - client, base_url: base_url.trim_end_matches('/').to_string(), api_key: api_key.to_string(), }) } + fn client(&self) -> Result { + reqwest::blocking::Client::builder() + .timeout(std::time::Duration::from_secs(120)) + .build() + .context("failed to build HTTP client") + } + /// Build from environment variables `LLM_BENCHMARK_UPLOAD_URL` and `LLM_BENCHMARK_API_KEY`. /// Returns `None` if `LLM_BENCHMARK_UPLOAD_URL` is not set. pub fn from_env() -> Result> { @@ -71,6 +155,7 @@ impl ApiClient { normalize_model_names(&mut results); let url = format!("{}/api/llm-benchmark-upload", self.base_url); + let client = self.client()?; let mut total_uploaded = 0usize; for lang_entry in &results.languages { @@ -92,8 +177,7 @@ impl ApiClient { "models": models_json, }); - let resp = self - .client + let resp = client .post(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .header("Content-Type", "application/json") @@ -113,7 +197,7 @@ impl ApiClient { let status = resp.status(); let body = resp.text().unwrap_or_default(); anyhow::bail!( - "upload failed for {}/{}: {} — {}", + "upload failed for {}/{}: {} - {}", lang_entry.lang, mode_entry.mode, status, @@ -126,6 +210,26 @@ impl ApiClient { Ok(total_uploaded) } + /// Fetch active/available benchmark models from the website model registry. + pub fn fetch_model_routes(&self) -> Result> { + let url = format!("{}/api/llm-benchmark-models?active=true", self.base_url); + let resp = self + .client()? + .get(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .send() + .context("fetch LLM benchmark models failed")?; + + if resp.status().is_success() { + let body: serde_json::Value = resp.json().context("parse model registry response")?; + parse_model_routes_response(&body) + } else { + let status = resp.status(); + let body = resp.text().unwrap_or_default(); + anyhow::bail!("fetch LLM benchmark models failed: {} - {}", status, body); + } + } + /// Upload the task catalog to `POST /api/llm-benchmark-tasks`, derived from /// the benchmarks directory structure on disk. pub fn upload_task_catalog(&self, bench_root: &std::path::Path) -> Result { @@ -207,7 +311,7 @@ impl ApiClient { let payload = json!({ "categories": categories }); let resp = self - .client + .client()? .post(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .header("Content-Type", "application/json") @@ -239,7 +343,7 @@ impl ApiClient { let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&")); let resp = self - .client + .client()? .get(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .send() @@ -282,7 +386,7 @@ impl ApiClient { let url = format!("{}/api/llm-benchmark-results?{}", self.base_url, params.join("&")); let resp = self - .client + .client()? .get(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .send() @@ -316,7 +420,7 @@ impl ApiClient { let url = format!("{}/api/llm-benchmark-upload", self.base_url); let resp = self - .client + .client()? .post(&url) .header("Authorization", format!("Bearer {}", self.api_key)) .header("Content-Type", "application/json") @@ -334,3 +438,67 @@ impl ApiClient { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_active_available_model_routes() { + let body = json!({ + "models": [ + { + "displayName": "GPT Test", + "vendor": "openai", + "apiModel": "gpt-test", + "openrouterModel": "openai/gpt-test", + "active": true, + "available": true + }, + { + "displayName": "Inactive", + "vendor": "openai", + "apiModel": "inactive", + "active": false, + "available": true + }, + { + "displayName": "Unavailable", + "vendor": "openai", + "apiModel": "unavailable", + "active": true, + "available": false + } + ] + }); + + let routes = parse_model_routes_response(&body).unwrap(); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "GPT Test"); + assert_eq!(routes[0].vendor, Vendor::OpenAi); + assert_eq!(routes[0].api_model, "gpt-test"); + assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test")); + } + + #[test] + fn parses_snake_case_model_route_fields() { + let body = json!({ + "models": [ + { + "display_name": "GPT Test", + "vendor": "openai", + "api_model": "gpt-test", + "openrouter_model": "openai/gpt-test", + "active": true, + "available": true + } + ] + }); + + let routes = parse_model_routes_response(&body).unwrap(); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "GPT Test"); + assert_eq!(routes[0].api_model, "gpt-test"); + assert_eq!(routes[0].openrouter_model.as_deref(), Some("openai/gpt-test")); + } +} diff --git a/tools/xtask-llm-benchmark/src/bench/analysis.rs b/tools/xtask-llm-benchmark/src/bench/analysis.rs index 0234cba1b8f..cb23fbb6cf5 100644 --- a/tools/xtask-llm-benchmark/src/bench/analysis.rs +++ b/tools/xtask-llm-benchmark/src/bench/analysis.rs @@ -27,10 +27,10 @@ pub async fn run_analysis( let prompt = build_prompt(lang, mode, model_name, bench_root, &failures); let route = ModelRoute::new( - "gpt-4.1-mini", + "gpt-5.4-mini", crate::llm::types::Vendor::OpenAi, - "gpt-4.1-mini", - Some("openai/gpt-4.1-mini"), + "gpt-5.4-mini", + Some("openai/gpt-5.4-mini"), ); let built = BuiltPrompt { diff --git a/tools/xtask-llm-benchmark/src/bench/publishers.rs b/tools/xtask-llm-benchmark/src/bench/publishers.rs index 68775ff631c..b7fb74c6936 100644 --- a/tools/xtask-llm-benchmark/src/bench/publishers.rs +++ b/tools/xtask-llm-benchmark/src/bench/publishers.rs @@ -1,12 +1,16 @@ use crate::bench::utils::sanitize_db_name; -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use regex::Regex; use std::borrow::Cow; use std::env; use std::fs; use std::path::{Path, PathBuf}; use std::process::Command; -use std::sync::LazyLock; +use std::sync::{ + atomic::{AtomicU64, Ordering}, + LazyLock, +}; +use std::time::{SystemTime, UNIX_EPOCH}; fn workspace_root() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -31,6 +35,163 @@ fn pnpm_minimum_release_age() -> Result { .ok_or_else(|| anyhow::anyhow!("pnpm-workspace.yaml is missing minimumReleaseAge")) } +fn path_entries() -> Vec { + #[cfg(windows)] + let path = env::var_os("Path").or_else(|| env::var_os("PATH")); + #[cfg(not(windows))] + let path = env::var_os("PATH"); + + path.map(|path| env::split_paths(&path).collect()).unwrap_or_default() +} + +fn command_path_candidates(name: &str) -> Vec { + #[cfg(windows)] + { + let path = Path::new(name); + if path.extension().is_some() { + vec![name.to_string()] + } else { + vec![ + format!("{name}.cmd"), + format!("{name}.exe"), + format!("{name}.bat"), + name.to_string(), + ] + } + } + #[cfg(not(windows))] + { + vec![name.to_string()] + } +} + +fn resolve_command_on_path(name: &str) -> Option { + for dir in path_entries() { + for candidate in command_path_candidates(name) { + let path = dir.join(candidate); + if path.is_file() { + return Some(path); + } + } + } + None +} + +fn configured_nodejs_dir() -> Option { + env::var("NODEJS_DIR") + .ok() + .map(|s| s.trim().trim_matches('"').trim().to_string()) + .filter(|s| !s.is_empty()) + .map(PathBuf::from) +} + +fn pnpm_in_dir(dir: &Path) -> Option { + #[cfg(windows)] + { + for candidate in ["pnpm.cmd", "pnpm.exe", "pnpm.bat"] { + let path = dir.join(candidate); + if path.is_file() { + return Some(path); + } + } + None + } + #[cfg(not(windows))] + { + let path = dir.join("pnpm"); + path.is_file().then_some(path) + } +} + +fn node_in_dir(dir: &Path) -> Option { + #[cfg(windows)] + let path = dir.join("node.exe"); + #[cfg(not(windows))] + let path = dir.join("node"); + + path.is_file().then_some(path) +} + +fn resolve_node_exe(nodejs_dir: Option<&Path>) -> Option { + nodejs_dir + .and_then(node_in_dir) + .or_else(|| resolve_command_on_path("node")) + .or_else(|| { + env::var("NVM_SYMLINK") + .ok() + .map(PathBuf::from) + .and_then(|dir| node_in_dir(&dir)) + }) +} + +struct CliRootDir { + path: PathBuf, +} + +impl CliRootDir { + fn path(&self) -> &Path { + &self.path + } +} + +impl Drop for CliRootDir { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.path); + } +} + +fn isolated_cli_root() -> Result { + static COUNTER: AtomicU64 = AtomicU64::new(0); + + for _ in 0..16 { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|duration| duration.as_nanos()) + .unwrap_or(0); + let id = COUNTER.fetch_add(1, Ordering::Relaxed); + let path = env::temp_dir().join(format!("stdb-llm-cli-{}-{nanos}-{id}", std::process::id())); + match fs::create_dir(&path) { + Ok(()) => return Ok(CliRootDir { path }), + Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => continue, + Err(error) => return Err(error.into()), + } + } + + bail!("failed to create isolated SpacetimeDB CLI root directory"); +} + +fn spacetime_cmd(cli_root: &CliRootDir) -> Command { + let mut cmd = Command::new("spacetime"); + cmd.arg("--root-dir").arg(cli_root.path()); + cmd +} + +fn pnpm_cjs_for_cmd(pnpm: &Path) -> Option { + #[cfg(windows)] + { + let is_cmd = pnpm + .extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("cmd")); + if !is_cmd { + return None; + } + + let cjs = pnpm + .parent()? + .join("node_modules") + .join("pnpm") + .join("bin") + .join("pnpm.cjs"); + cjs.is_file().then_some(cjs) + } + #[cfg(not(windows))] + { + let _ = pnpm; + None + } +} + /// Strip ANSI escape codes (color codes) from a string fn strip_ansi_codes(s: &str) -> Cow<'_, str> { static ANSI_RE: LazyLock = LazyLock::new(|| { @@ -50,14 +211,14 @@ pub trait Publisher: Send + Sync { /// Check if the process was killed by a signal (e.g., SIGSEGV = 11) #[cfg(unix)] -fn was_signal_killed(status: &std::process::ExitStatus) -> bool { +fn signal_killed_by(status: &std::process::ExitStatus) -> Option { use std::os::unix::process::ExitStatusExt; - status.signal().is_some() + status.signal() } #[cfg(not(unix))] -fn was_signal_killed(_status: &std::process::ExitStatus) -> bool { - false +fn signal_killed_by(_status: &std::process::ExitStatus) -> Option { + None } /// Check if the failure is a transient error that should be retried. @@ -73,6 +234,8 @@ fn is_transient_build_error(stderr: &str, stdout: &str) -> bool { // trying to extract the same tarball simultaneously || (combined.contains("wasi-sdk") && combined.contains("tar")) || (combined.contains("MSB3073") && combined.contains("exited with code 2")) + // dotnet can crash below spacetime while spacetime exits 1. + || combined.contains("code Result<()> { @@ -119,13 +282,14 @@ fn run_with_retry(cmd: &mut Command, label: &str, max_retries: u32) -> Result<() let stderr = strip_ansi_codes(&stderr_raw); let stdout = strip_ansi_codes(&stdout_raw); - // Retry on signal kills (like SIGSEGV) or transient build errors - let should_retry = was_signal_killed(&out.status) || is_transient_build_error(&stderr, &stdout); + // Retry on signal kills (like SIGSEGV) or transient build errors. + let signal = signal_killed_by(&out.status); + let should_retry = signal.is_some() || is_transient_build_error(&stderr, &stdout); if should_retry && attempt < max_retries { - let reason = if was_signal_killed(&out.status) { - "signal kill" + let reason = if let Some(signal) = signal { + format!("signal {signal}") } else { - "transient build error" + "transient build error".to_string() }; eprintln!("⚠️ {label}: {reason} detected, will retry..."); last_error = Some(format!( @@ -162,6 +326,19 @@ impl DotnetPublisher { } Ok(()) } + + fn configure_dotnet_env(cmd: &mut Command) -> &mut Command { + cmd.env("DOTNET_CLI_TELEMETRY_OPTOUT", "1") + .env("DOTNET_NOLOGO", "1") + // The CI runner's .NET install can crash while formatting localized + // DateTime/TimeZoneInfo data before publish starts. Force invariant + // globalization so generated C# module publish reaches MSBuild. + .env("DOTNET_SYSTEM_GLOBALIZATION_INVARIANT", "1") + // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors + // when running multiple dotnet builds in parallel. + .env("MSBUILDDISABLENODEREUSE", "1") + .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0") + } } impl Publisher for DotnetPublisher { @@ -174,27 +351,23 @@ impl Publisher for DotnetPublisher { Self::ensure_csproj(source)?; let db = sanitize_db_name(module_name); + let source = source + .canonicalize() + .with_context(|| format!("failed to resolve C# source path {}", source.display()))?; + let cli_root = isolated_cli_root()?; - let mut cmd = Command::new("spacetime"); - cmd.arg("build") - .current_dir(source) - .env("DOTNET_CLI_TELEMETRY_OPTOUT", "1") - .env("DOTNET_NOLOGO", "1") - // Prevent MSBuild node reuse issues that cause "Pipe is broken" errors - // when running multiple dotnet builds in parallel. - .env("MSBUILDDISABLENODEREUSE", "1") - .env("DOTNET_CLI_USE_MSBUILD_SERVER", "0"); - run(&mut cmd, "spacetime build (csharp)")?; - - let mut pubcmd = Command::new("spacetime"); + let mut pubcmd = spacetime_cmd(&cli_root); pubcmd .arg("publish") .arg("-c") .arg("-y") .arg("--server") .arg(host_url) + .arg("--module-path") + .arg(&source) .arg(&db) - .current_dir(source); + .current_dir(&source); + Self::configure_dotnet_env(&mut pubcmd); run(&mut pubcmd, "spacetime publish (csharp)")?; Ok(()) @@ -228,10 +401,11 @@ impl Publisher for SpacetimeRustPublisher { // sanitize db + server let db = sanitize_db_name(module_name); + let cli_root = isolated_cli_root()?; // 2) Publish run( - Command::new("spacetime") + spacetime_cmd(&cli_root) .arg("publish") .arg("-c") .arg("-y") @@ -271,51 +445,34 @@ impl Publisher for TypeScriptPublisher { Self::ensure_package_json(source)?; let db = sanitize_db_name(module_name); + let cli_root = isolated_cli_root()?; // Install dependencies (--ignore-workspace to avoid parent workspace interference). - // If NODEJS_DIR is set (e.g. nvm4w on Windows), use full path to pnpm so spawn finds it. - let pnpm_exe = env::var("NODEJS_DIR") - .ok() - .map(|s| s.trim().trim_matches('"').trim().to_string()) - .filter(|s| !s.is_empty()) - .map(PathBuf::from) - .and_then(|dir| { - #[cfg(windows)] - { - let pnpm_cmd = dir.join("pnpm.cmd"); - let pnpm_exe_path = dir.join("pnpm.exe"); - if pnpm_cmd.is_file() { - eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.cmd)", dir.display()); - Some(pnpm_cmd) - } else if pnpm_exe_path.is_file() { - eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm.exe)", dir.display()); - Some(pnpm_exe_path) - } else { - eprintln!( - "[pnpm] NODEJS_DIR set to {} but pnpm.cmd/pnpm.exe not found there, using PATH", - dir.display() - ); - None - } - } - #[cfg(not(windows))] - { - let pnpm = dir.join("pnpm"); - if pnpm.is_file() { - eprintln!("[pnpm] using NODEJS_DIR: {} (pnpm)", dir.display()); - Some(pnpm) - } else { - eprintln!( - "[pnpm] NODEJS_DIR set to {} but pnpm not found there, using PATH", - dir.display() - ); - None - } - } - }); - let mut pnpm_cmd = match &pnpm_exe { - Some(p) => Command::new(p), - None => Command::new("pnpm"), + let nodejs_dir = configured_nodejs_dir(); + let pnpm_exe = nodejs_dir + .as_deref() + .and_then(pnpm_in_dir) + .or_else(|| resolve_command_on_path("pnpm")); + if let Some(ref pnpm) = pnpm_exe { + eprintln!("[pnpm] using {}", pnpm.display()); + } else if let Some(ref dir) = nodejs_dir { + eprintln!( + "[pnpm] NODEJS_DIR set to {} but pnpm not found there or on PATH", + dir.display() + ); + } + let node_exe = resolve_node_exe(nodejs_dir.as_deref()); + let pnpm_cjs = pnpm_exe.as_deref().and_then(pnpm_cjs_for_cmd); + let mut pnpm_cmd = if let (Some(node), Some(cjs)) = (&node_exe, pnpm_cjs) { + eprintln!("[pnpm] invoking {} {}", node.display(), cjs.display()); + let mut cmd = Command::new(node); + cmd.arg(cjs); + cmd + } else { + match &pnpm_exe { + Some(p) => Command::new(p), + None => Command::new("pnpm"), + } }; pnpm_cmd .arg("install") @@ -325,30 +482,62 @@ impl Publisher for TypeScriptPublisher { // This install runs in a materialized project with workspace config // ignored, so pass the repo's pnpm package-age policy explicitly. .env("npm_config_minimum_release_age", pnpm_minimum_release_age()?); - // When using NODEJS_DIR, prepend it to PATH so pnpm.cmd can find node. - if let Some(ref dir) = pnpm_exe - && let Some(parent) = dir.parent() + let mut prepend_paths = Vec::new(); + if let Some(dir) = nodejs_dir { + prepend_paths.push(dir); + } + if let Some(ref pnpm) = pnpm_exe + && let Some(parent) = pnpm.parent() + { + prepend_paths.push(parent.to_path_buf()); + } + if let Some(node) = node_exe + && let Some(parent) = node.parent() { - let mut paths: Vec = env::split_paths(&env::var("PATH").unwrap_or_default()).collect(); - paths.insert(0, parent.to_path_buf()); - if let Ok(new_path) = env::join_paths(paths) { - pnpm_cmd.env("PATH", new_path); + prepend_paths.push(parent.to_path_buf()); + } + let child_path = if !prepend_paths.is_empty() { + let mut paths = path_entries(); + for path in prepend_paths.into_iter().rev() { + if !paths.iter().any(|existing| existing == &path) { + paths.insert(0, path); + } } + env::join_paths(paths).ok() + } else { + None + }; + if let Some(ref new_path) = child_path { + #[cfg(windows)] + { + pnpm_cmd.env_remove("PATH"); + pnpm_cmd.env("Path", new_path); + } + #[cfg(not(windows))] + pnpm_cmd.env("PATH", new_path); } run(&mut pnpm_cmd, "pnpm install (typescript)")?; // Publish (spacetime CLI handles TypeScript compilation internally) - run( - Command::new("spacetime") - .arg("publish") - .arg("-c") - .arg("-y") - .arg("--server") - .arg(host_url) - .arg(&db) - .current_dir(source), - "spacetime publish (typescript)", - )?; + let mut publish_cmd = spacetime_cmd(&cli_root); + publish_cmd + .arg("publish") + .arg("-c") + .arg("-y") + .arg("--server") + .arg(host_url) + .arg(&db) + .current_dir(source); + if let Some(ref new_path) = child_path { + #[cfg(windows)] + { + publish_cmd.env_remove("PATH"); + publish_cmd.env("Path", new_path); + } + #[cfg(not(windows))] + publish_cmd.env("PATH", new_path); + } + run(&mut publish_cmd, "spacetime publish (typescript)")?; Ok(()) } diff --git a/tools/xtask-llm-benchmark/src/bench/runner.rs b/tools/xtask-llm-benchmark/src/bench/runner.rs index 42acd77a70c..2536b5e5fe1 100644 --- a/tools/xtask-llm-benchmark/src/bench/runner.rs +++ b/tools/xtask-llm-benchmark/src/bench/runner.rs @@ -473,6 +473,23 @@ async fn maybe_generate_analysis(cfg: &BenchRunContext<'_>, outcomes: &[RunOutco Ok(analysis) } +async fn upload_batch_for_context( + cfg: &BenchRunContext<'_>, + outcomes: &[RunOutcome], + analysis: Option<&str>, +) -> Result<()> { + if let Some(api) = cfg.api_client.clone() { + let mode = cfg.mode.to_string(); + let outcomes = outcomes.to_vec(); + let analysis = analysis.map(str::to_string); + tokio::task::spawn_blocking(move || api.upload_batch(&mode, &outcomes, analysis.as_deref())).await??; + } else { + eprintln!("[runner] no API client configured; skipping upload"); + } + + Ok(()) +} + pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Result> { let total_wall = Instant::now(); @@ -632,11 +649,7 @@ pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Resu None } }; - if let Some(ref api) = cfg.api_client { - api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?; - } else { - eprintln!("[runner] no API client configured; skipping upload"); - } + upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?; } else { eprintln!("[runner] no results; skipping upload"); } @@ -831,11 +844,7 @@ pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> None } }; - if let Some(ref api) = cfg.api_client { - api.upload_batch(cfg.mode, &outcomes, analysis.as_deref())?; - } else { - eprintln!("[runner] no API client configured; skipping upload"); - } + upload_batch_for_context(cfg, &outcomes, analysis.as_deref()).await?; } println!( diff --git a/tools/xtask-llm-benchmark/src/bench/templates.rs b/tools/xtask-llm-benchmark/src/bench/templates.rs index b5fa5f6add3..35176de8200 100644 --- a/tools/xtask-llm-benchmark/src/bench/templates.rs +++ b/tools/xtask-llm-benchmark/src/bench/templates.rs @@ -159,20 +159,104 @@ fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> { } fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))?; - let base_rel = relative_to_workspace(root, "crates/bindings-csharp")?; let runtime_csproj = workspace_root().join("crates/bindings-csharp/Runtime/Runtime.csproj"); if !runtime_csproj.is_file() { bail!("local C# Runtime not found at {}", runtime_csproj.display()); } - let runtime_ref = format!("{}/Runtime/Runtime.csproj", base_rel); - let runtime_dir = format!("{}/Runtime", base_rel); - let codegen_ref = format!("{}/Codegen/Codegen.csproj", base_rel); + let runtime_version = read_csharp_package_version(&runtime_csproj)?; let csproj_path = root.join("StdbModule.csproj"); let mut csproj = fs::read_to_string(&csproj_path).with_context(|| format!("read {}", csproj_path.display()))?; - csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_DIR}", &runtime_dir); - csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_REF}", &runtime_ref); - csproj = csproj.replace("{SPACETIME_CSHARP_CODEGEN_REF}", &codegen_ref); + csproj = csproj.replace("{SPACETIME_CSHARP_RUNTIME_VERSION}", &runtime_version); fs::write(&csproj_path, csproj).with_context(|| format!("write {}", csproj_path.display()))?; + + write_csharp_nuget_config(root)?; + Ok(()) +} + +fn read_csharp_package_version(csproj_path: &Path) -> Result { + let contents = fs::read_to_string(csproj_path).with_context(|| format!("read {}", csproj_path.display()))?; + let version = contents + .split("") + .nth(1) + .and_then(|rest| rest.split("").next()) + .map(str::trim) + .filter(|version| !version.is_empty()) + .with_context(|| format!("missing in {}", csproj_path.display()))?; + Ok(version.to_owned()) +} + +fn normalize_nuget_path(path: &Path) -> String { + path.display() + .to_string() + .replace('\\', "/") + .trim_end_matches('/') + .to_string() +} + +fn ensure_csharp_package_source(path: &Path, package_id: &str) -> Result<()> { + let has_package = fs::read_dir(path).ok().into_iter().flatten().flatten().any(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with(package_id) && name.ends_with(".nupkg")) + }); + if !has_package { + bail!( + "local C# package {} not found in {}. Run: dotnet pack -c Release crates/bindings-csharp/{}", + package_id, + path.display(), + package_id.strip_prefix("SpacetimeDB.").unwrap_or(package_id) + ); + } + Ok(()) +} + +fn write_csharp_nuget_config(root: &Path) -> Result<()> { + let workspace = workspace_root(); + let runtime_source = workspace.join("crates/bindings-csharp/Runtime/bin/Release"); + let bsatn_source = workspace.join("crates/bindings-csharp/BSATN.Runtime/bin/Release"); + + ensure_csharp_package_source(&runtime_source, "SpacetimeDB.Runtime")?; + ensure_csharp_package_source(&bsatn_source, "SpacetimeDB.BSATN.Runtime")?; + + let package_cache = root.join(".nuget/packages"); + if package_cache.exists() { + fs::remove_dir_all(&package_cache).with_context(|| format!("remove {}", package_cache.display()))?; + } + fs::create_dir_all(&package_cache).with_context(|| format!("create {}", package_cache.display()))?; + + let nuget_config = format!( + r#" + + + + + + + + + + + + + + + + + + + + + + +"#, + normalize_nuget_path(&package_cache), + normalize_nuget_path(&runtime_source), + normalize_nuget_path(&bsatn_source), + ); + + fs::write(root.join("nuget.config"), nuget_config) + .with_context(|| format!("write {}", root.join("nuget.config").display()))?; Ok(()) } diff --git a/tools/xtask-llm-benchmark/src/bench/types.rs b/tools/xtask-llm-benchmark/src/bench/types.rs index 930e3feac1c..e54df0d4902 100644 --- a/tools/xtask-llm-benchmark/src/bench/types.rs +++ b/tools/xtask-llm-benchmark/src/bench/types.rs @@ -209,4 +209,6 @@ pub struct RunConfig { pub local_analysis: bool, /// Shared identifier used to group dry-run artifacts pub dry_run_id: Option, + /// Website-provided route list used instead of static default_model_routes() + pub route_overrides: Option>, } diff --git a/tools/xtask-llm-benchmark/src/bench/utils.rs b/tools/xtask-llm-benchmark/src/bench/utils.rs index a8ccddc23e5..6e28315e4f6 100644 --- a/tools/xtask-llm-benchmark/src/bench/utils.rs +++ b/tools/xtask-llm-benchmark/src/bench/utils.rs @@ -109,13 +109,13 @@ pub fn bench_rust_concurrency() -> usize { .unwrap_or(2) } -/// Concurrency for C# builds. Lower default than Rust due to dotnet/WASI SDK -/// instability under high parallelism (causes SIGSEGV and "Pipe is broken" errors). +/// Concurrency for C# builds. Keep this serialized to match smoketest behavior; +/// dotnet/WASI SDK builds are fragile when multiple generated modules publish at once. pub fn bench_csharp_concurrency() -> usize { env::var("LLM_BENCH_CSHARP_CONCURRENCY") .ok() .and_then(|s| s.parse().ok()) - .unwrap_or(4) + .unwrap_or(1) } pub fn bench_route_concurrency() -> usize { diff --git a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts index 26c7dc9b230..1ba8ca175d1 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/queries/t_037_multi_column_filter/answers/typescript.ts @@ -2,7 +2,7 @@ import { schema, table, t } from 'spacetimedb/server'; const eventLog = table({ name: 'event_log', - indexes: [{ name: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }], + indexes: [{ accessor: 'byCategorySeverity', algorithm: 'btree', columns: ['category', 'severity'] }], }, { id: t.u64().primaryKey().autoInc(), category: t.string(), diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts index 50d9f9c1dae..d23dead5a96 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_018_constraints/answers/typescript.ts @@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server'; const account = table({ name: 'account', - indexes: [{ name: 'byName', algorithm: 'btree', columns: ['name'] }], + indexes: [{ accessor: 'byName', algorithm: 'btree', columns: ['name'] }], }, { id: t.u64().primaryKey().autoInc(), email: t.string().unique(), diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts index d7629137dcc..4ab152504d1 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_019_many_to_many/answers/typescript.ts @@ -24,8 +24,8 @@ const membership = table( { name: 'membership', indexes: [ - { name: 'byUser', algorithm: 'btree', columns: ['userId'] }, - { name: 'byGroup', algorithm: 'btree', columns: ['groupId'] }, + { accessor: 'byUser', algorithm: 'btree', columns: ['userId'] }, + { accessor: 'byGroup', algorithm: 'btree', columns: ['groupId'] }, ], }, { diff --git a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts index 5d5fb568d7b..2f237fb0151 100644 --- a/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts +++ b/tools/xtask-llm-benchmark/src/benchmarks/schema/t_021_multi_column_index/answers/typescript.ts @@ -2,7 +2,7 @@ import { table, schema, t } from 'spacetimedb/server'; const log = table({ name: 'log', - indexes: [{ name: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }], + indexes: [{ accessor: 'byUserDay', algorithm: 'btree', columns: ['userId', 'day'] }], }, { id: t.u64().primaryKey().autoInc(), userId: t.i32(), diff --git a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs index c624fdc4108..6ec030a49e8 100644 --- a/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs +++ b/tools/xtask-llm-benchmark/src/bin/llm_benchmark.rs @@ -1,7 +1,7 @@ #![allow(clippy::disallowed_macros, clippy::type_complexity, clippy::enum_variant_names)] use anyhow::{Context, Result}; -use clap::{Args, Parser, Subcommand}; +use clap::{Args, Parser, Subcommand, ValueEnum}; use futures::{StreamExt, TryStreamExt}; use spacetimedb_data_structures::map::{HashCollectionExt as _, HashMap, HashSet}; use spacetimedb_guard::SpacetimeDbGuard; @@ -71,6 +71,12 @@ struct Cli { command: Commands, } +#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] +enum ModelSource { + Static, + Remote, +} + #[derive(Subcommand, Debug)] enum Commands { /// Run benchmarks / build goldens / compute hashes. @@ -124,6 +130,10 @@ struct RunArgs { #[arg(long, num_args = 1..)] models: Option>, + /// Where to resolve models when --models is not provided + #[arg(long, value_enum, default_value_t = ModelSource::Static)] + model_source: ModelSource, + /// Run benchmarks without uploading results #[arg(long)] dry_run: bool, @@ -131,6 +141,9 @@ struct RunArgs { /// When used with --dry-run, also generate local markdown analysis files #[arg(long, requires = "dry_run")] local_analysis: bool, + + #[arg(skip)] + route_overrides: Option>, } #[derive(Args, Debug, Clone)] @@ -218,14 +231,17 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { let dry_run = args.dry_run; let local_analysis = args.local_analysis; let dry_run_id = dry_run.then(|| chrono::Utc::now().format("%Y-%m-%d_%H%M%S").to_string()); + let should_fetch_remote_routes = should_fetch_remote_routes(&args); - let api_client = if dry_run { - None - } else { + let needs_api_client = should_fetch_remote_routes || !dry_run; + let api_client = if needs_api_client { ApiClient::from_env().context("failed to initialize API client")? + } else { + None }; + let upload_client = if dry_run { None } else { api_client.clone() }; - if api_client.is_none() && !dry_run { + if upload_client.is_none() && !dry_run { eprintln!("[warn] LLM_BENCHMARK_UPLOAD_URL not set; results will not be uploaded"); } @@ -240,31 +256,30 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { categories: categories_to_set(args.categories), model_filter: model_filter_from_groups(args.models), host: None, - api_client: api_client.clone(), + api_client: upload_client.clone(), dry_run, local_analysis, dry_run_id: dry_run_id.clone(), + route_overrides: args.route_overrides, }; + if should_fetch_remote_routes { + let api = api_client + .as_ref() + .context("LLM_BENCHMARK_UPLOAD_URL required when --model-source remote is used")?; + config.route_overrides = Some(api.fetch_model_routes()?); + } + let bench_root = find_bench_root(); // Upload task catalog before running benchmarks - if let Some(ref api) = api_client + if let Some(ref api) = upload_client && let Err(e) = api.upload_task_catalog(&bench_root) { eprintln!("[warn] failed to upload task catalog: {e}"); } - let modes = config - .modes - .clone() - .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect()); - - let RuntimeInit { - runtime, - provider: llm_provider, - guard, - } = initialize_runtime_and_provider(config.hash_only, config.goldens_only)?; + let RuntimeInit { runtime, guard } = initialize_runtime(config.hash_only)?; config.host = guard.as_ref().map(|g| g.host_url.clone()); @@ -273,7 +288,24 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { let selectors: Option> = config.selectors.clone(); let selectors_ref: Option<&[String]> = selectors.as_deref(); - if !config.goldens_only && !config.hash_only { + let modes = config + .modes + .clone() + .unwrap_or_else(|| ALL_MODES.iter().map(|s| s.to_string()).collect()); + + if config.goldens_only { + let rt = runtime.as_ref().expect("runtime required for --goldens-only"); + rt.block_on(build_goldens_only_for_lang( + config.host.clone(), + &bench_root, + config.lang, + selectors_ref, + ))?; + println!("[{}] goldens-only build complete", config.lang.as_str()); + return Ok(()); + } + + let llm_provider = if !config.goldens_only && !config.hash_only { let rt = runtime.as_ref().expect("failed to initialize runtime for goldens"); rt.block_on(ensure_goldens_built_once( config.host.clone(), @@ -281,7 +313,15 @@ fn run_benchmarks(args: RunArgs) -> Result<()> { config.lang, selectors_ref, ))?; - } + + let provider = make_provider_from_env()?; + let rt = runtime.as_ref().expect("failed to initialize runtime for preflight"); + let routes = filter_routes(&config); + preflight_llm_routes(rt, provider.as_ref(), &routes, &modes)?; + Some(provider) + } else { + None + }; let mut all_outcomes: Vec = Vec::new(); @@ -379,10 +419,10 @@ fn cmd_analyze(args: AnalyzeArgs) -> Result<()> { let provider = make_provider_from_env()?; let analysis_route = ModelRoute::new( - "gpt-4.1-mini", + "gpt-5.4-mini", xtask_llm_benchmark::llm::types::Vendor::OpenAi, - "gpt-4.1-mini", - Some("openai/gpt-4.1-mini"), + "gpt-5.4-mini", + Some("openai/gpt-5.4-mini"), ); for ((lang, mode, model), group_failures) in &groups { @@ -517,6 +557,59 @@ fn short_hash(s: &str) -> &str { &s[..s.len().min(12)] } +fn should_fetch_remote_routes(args: &RunArgs) -> bool { + args.model_source == ModelSource::Remote + && args.models.is_none() + && args.route_overrides.is_none() + && !args.hash_only + && !args.goldens_only +} + +fn preflight_llm_routes( + runtime: &Runtime, + llm_provider: &dyn LlmProvider, + routes: &[ModelRoute], + modes: &[String], +) -> Result<()> { + if routes.is_empty() { + return Ok(()); + } + + let mut search_flags = Vec::new(); + if modes.iter().any(|mode| mode == "search") { + search_flags.push(true); + } + if modes.iter().any(|mode| mode != "search") { + search_flags.push(false); + } + + let mut failures = Vec::new(); + for route in routes { + for search_enabled in &search_flags { + let mode_label = if *search_enabled { + "search/OpenRouter online" + } else { + "standard" + }; + + if let Err(err) = runtime.block_on(llm_provider.preflight_route(route, *search_enabled)) { + let msg = format!("{} ({mode_label}): {err:#}", route.display_name); + eprintln!("[preflight] FAILED {msg}"); + failures.push(msg); + } + } + } + + if !failures.is_empty() { + anyhow::bail!( + "LLM provider preflight failed before benchmark run:\n - {}", + failures.join("\n - ") + ); + } + + Ok(()) +} + /// Run benchmarks for a single mode. fn run_mode_benchmarks( mode: &str, @@ -538,15 +631,6 @@ fn run_mode_benchmarks( return Ok(Vec::new()); } - if config.goldens_only { - let rt = runtime.expect("runtime required for --goldens-only"); - let sels = config.selectors.as_deref(); - - rt.block_on(build_goldens_only_for_lang(config.host.clone(), bench_root, lang, sels))?; - println!("{:<12} [{:<10}] goldens-only build complete", mode, lang_str); - return Ok(Vec::new()); - } - // Run benchmarks for all matching routes let routes = filter_routes(config); @@ -598,7 +682,12 @@ fn run_mode_benchmarks( /// When explicit `openrouter:vendor/model` entries are passed they won't appear in /// `default_model_routes`, so we synthesize ad-hoc routes for them here. fn filter_routes(config: &RunConfig) -> Vec { - let mut routes: Vec = default_model_routes() + let base_routes: Vec = config + .route_overrides + .clone() + .unwrap_or_else(|| default_model_routes().to_vec()); + + let mut routes: Vec = base_routes .iter() .filter(|r| config.providers_filter.as_ref().is_none_or(|f| f.contains(&r.vendor))) .filter(|r| match &config.model_filter { @@ -627,6 +716,7 @@ fn filter_routes(config: &RunConfig) -> Vec { let already_matched = routes.iter().any(|r| { r.vendor == *vendor && (r.api_model == model_id.as_str() + || r.display_name.to_ascii_lowercase() == model_id.as_str() || r.openrouter_model.as_deref() == Some(model_id.as_str())) }); if !already_matched { @@ -708,15 +798,13 @@ fn categories_to_set(v: Option>) -> Option> { pub struct RuntimeInit { pub runtime: Option, - pub provider: Option>, pub guard: Option, } -fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Result { +fn initialize_runtime(hash_only: bool) -> Result { if hash_only { return Ok(RuntimeInit { runtime: None, - provider: None, guard: None, }); } @@ -726,18 +814,8 @@ fn initialize_runtime_and_provider(hash_only: bool, goldens_only: bool) -> Resul let runtime = tokio::runtime::Builder::new_multi_thread().enable_all().build()?; - if goldens_only { - return Ok(RuntimeInit { - runtime: Some(runtime), - provider: None, - guard: Some(spacetime), - }); - } - - let llm_provider = make_provider_from_env()?; Ok(RuntimeInit { runtime: Some(runtime), - provider: Some(llm_provider), guard: Some(spacetime), }) } @@ -753,8 +831,8 @@ fn find_bench_root() -> PathBuf { start.join("src").join("benchmarks") } -fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet) -> Result> { - let mut nums = HashSet::new(); +fn collect_task_names_in_categories(bench_root: &Path, cats: &HashSet) -> Result> { + let mut tasks = HashSet::new(); for c in cats { let dir = bench_root.join(c); if !dir.is_dir() { @@ -765,24 +843,38 @@ fn collect_task_numbers_in_categories(bench_root: &Path, cats: &HashSet) if !entry.file_type()?.is_dir() { continue; } - let name = entry.file_name().to_string_lossy().into_owned(); - if let Some(rest) = name.strip_prefix("t_") - && let Some((num_str, _)) = rest.split_once('_') - && num_str.len() == 3 - && let Ok(n) = num_str.parse::() - { - nums.insert(n); - } + tasks.insert(entry.file_name().to_string_lossy().to_ascii_lowercase()); } } - Ok(nums) + Ok(tasks) } -fn normalize_numeric_selectors(raw: &[String]) -> Vec { - raw.iter() - .filter(|s| !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())) - .filter_map(|s| s.parse::().ok()) - .collect() +fn task_selector_matches_any(selector: &str, allowed_tasks: &HashSet) -> bool { + allowed_tasks.iter().any(|task| task.starts_with(selector)) +} + +fn normalize_task_filter_selector(raw: &str) -> Result { + let s = raw.trim().to_ascii_lowercase(); + if s.is_empty() { + anyhow::bail!("empty task selector"); + } + if let Some(rest) = s.strip_prefix("t_") { + if rest.chars().all(|c| c.is_ascii_digit()) { + let n: u32 = rest.parse()?; + return Ok(format!("t_{:03}", n)); + } + if rest.chars().next().is_some_and(|c| c.is_ascii_digit()) + && rest.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') + { + return Ok(s); + } + anyhow::bail!("invalid task selector: {raw}"); + } + if s.chars().all(|c| c.is_ascii_digit()) { + let n: u32 = s.parse()?; + return Ok(format!("t_{:03}", n)); + } + anyhow::bail!("invalid task selector: {raw}") } fn apply_category_filter( @@ -796,23 +888,148 @@ fn apply_category_filter( Ok(selectors.map(|s| s.to_vec())) } Some(cats) => { - let allowed = collect_task_numbers_in_categories(bench_root, cats)?; - let out_nums: Vec = match selectors { + let allowed = collect_task_names_in_categories(bench_root, cats)?; + let mut out: Vec = match selectors { Some(user) => { - let nums = normalize_numeric_selectors(user); - nums.into_iter().filter(|n| allowed.contains(n)).collect() + let mut selected = Vec::new(); + for selector in user { + let normalized = normalize_task_filter_selector(selector)?; + if task_selector_matches_any(&normalized, &allowed) { + selected.push(normalized); + } + } + selected } None => { - let mut v: Vec = allowed.into_iter().collect(); + let mut v: Vec = allowed.into_iter().collect(); v.sort_unstable(); v } }; - if out_nums.is_empty() { - Ok(None) - } else { - Ok(Some(out_nums.into_iter().map(|n| n.to_string()).collect())) + out.sort(); + out.dedup(); + if out.is_empty() { + anyhow::bail!("no tasks matched category/task filters"); } + Ok(Some(out)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn base_run_args() -> RunArgs { + RunArgs { + modes: None, + lang: Lang::Rust, + hash_only: false, + goldens_only: false, + force: false, + categories: None, + tasks: None, + providers: None, + models: None, + model_source: ModelSource::Static, + dry_run: false, + local_analysis: false, + route_overrides: None, + } + } + + fn base_config(route_overrides: Option>) -> RunConfig { + RunConfig { + modes: None, + hash_only: false, + goldens_only: false, + lang: Lang::Rust, + providers_filter: None, + selectors: None, + force: false, + categories: None, + model_filter: None, + host: None, + api_client: None, + dry_run: false, + local_analysis: false, + dry_run_id: None, + route_overrides, } } + + #[test] + fn explicit_models_bypass_remote_model_source() { + let mut args = base_run_args(); + args.model_source = ModelSource::Remote; + assert!(should_fetch_remote_routes(&args)); + + args.models = Some(vec![ModelGroup { + vendor: Vendor::OpenAi, + models: vec!["gpt-test".to_string()], + }]); + assert!(!should_fetch_remote_routes(&args)); + + args.dry_run = true; + assert!(!should_fetch_remote_routes(&args)); + } + + #[test] + fn filter_routes_uses_remote_route_override() { + let remote_route = ModelRoute::new( + "Remote Model", + Vendor::OpenRouter, + "openai/remote-model", + Some("openai/remote-model"), + ); + let config = base_config(Some(vec![remote_route])); + + let routes = filter_routes(&config); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "Remote Model"); + assert_eq!(routes[0].api_model, "openai/remote-model"); + } + + #[test] + fn filter_routes_does_not_synthesize_duplicate_for_display_name_match() { + let remote_route = ModelRoute::new( + "DeepSeek V4 Flash", + Vendor::DeepSeek, + "deepseek-v4-flash", + Some("deepseek/deepseek-v4-flash"), + ); + let mut config = base_config(Some(vec![remote_route])); + let mut allowed = HashSet::new(); + allowed.insert("deepseek v4 flash".to_string()); + let mut filter = HashMap::new(); + filter.insert(Vendor::DeepSeek, allowed); + config.model_filter = Some(filter); + + let routes = filter_routes(&config); + assert_eq!(routes.len(), 1); + assert_eq!(routes[0].display_name, "DeepSeek V4 Flash"); + assert_eq!(routes[0].api_model, "deepseek-v4-flash"); + } + + #[test] + fn category_filter_accepts_full_task_ids() { + let root = std::env::temp_dir().join(format!( + "llm-benchmark-test-{}", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(root.join("basics").join("t_001_basic_tables")).unwrap(); + fs::create_dir_all(root.join("schema").join("t_012_product_type")).unwrap(); + + let mut categories = HashSet::new(); + categories.insert("basics".to_string()); + let selectors = vec!["t_001_basic_tables".to_string(), "t_012_product_type".to_string()]; + + let filtered = apply_category_filter(&root, Some(&categories), Some(&selectors)).unwrap(); + fs::remove_dir_all(&root).unwrap(); + + assert_eq!(filtered, Some(vec!["t_001_basic_tables".to_string()])); + } } diff --git a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs index c7a057c4638..8bb0d1ac734 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/anthropic.rs @@ -237,6 +237,12 @@ fn anthropic_max_output_tokens() -> u32 { pub fn normalize_anthropic_model(id: &str) -> &str { let lid = id.to_ascii_lowercase().replace('_', "-"); match lid.as_str() { + // Opus 4.8 + "opus-4.8" | "claude-opus-4.8" | "claude-opus-4-8" => "claude-opus-4-8", + + // Sonnet 4.6 + "sonnet-4.6" | "claude-sonnet-4.6" | "claude-sonnet-4-6" => "claude-sonnet-4-6", + // Sonnet 4.5 "sonnet-4.5" | "claude-sonnet-4.5" | "claude-sonnet-4-5" => "claude-sonnet-4-5", "claude-sonnet-4-5-20250929" => "claude-sonnet-4-5-20250929", diff --git a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs index 172beef8ff8..254fe5b8f63 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/mod.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/mod.rs @@ -8,6 +8,9 @@ pub mod openai; pub mod openrouter; pub mod xai; +use anyhow::Result; +use async_trait::async_trait; + pub use anthropic::AnthropicClient; pub use deepseek::DeepSeekClient; pub use google::GoogleGeminiClient; @@ -15,3 +18,76 @@ pub use meta::MetaLlamaClient; pub use openai::OpenAiClient; pub use openrouter::OpenRouterClient; pub use xai::XaiGrokClient; + +use crate::llm::prompt::BuiltPrompt; +use crate::llm::types::LlmOutput; + +#[derive(Debug, Clone)] +pub struct ClientPreflight { + summary: String, +} + +impl ClientPreflight { + pub fn new(summary: impl Into) -> Self { + Self { + summary: summary.into(), + } + } + + pub fn summary(&self) -> &str { + &self.summary + } +} + +#[async_trait] +pub trait LlmClient: Send + Sync { + fn provider_name(&self) -> &'static str; + + async fn preflight(&self, model: &str) -> Result { + Ok(ClientPreflight::new(format!( + "{} credit preflight not implemented for model '{}'; skipped", + self.provider_name(), + model + ))) + } + + async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result; +} + +macro_rules! impl_direct_llm_client { + ($ty:ty, $provider_name:literal) => { + #[async_trait] + impl LlmClient for $ty { + fn provider_name(&self) -> &'static str { + $provider_name + } + + async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { + <$ty>::generate(self, model, prompt).await + } + } + }; +} + +impl_direct_llm_client!(OpenAiClient, "OpenAI"); +impl_direct_llm_client!(AnthropicClient, "Anthropic"); +impl_direct_llm_client!(GoogleGeminiClient, "Google"); +impl_direct_llm_client!(XaiGrokClient, "xAI"); +impl_direct_llm_client!(DeepSeekClient, "DeepSeek"); +impl_direct_llm_client!(MetaLlamaClient, "Meta"); + +#[async_trait] +impl LlmClient for OpenRouterClient { + fn provider_name(&self) -> &'static str { + "OpenRouter" + } + + async fn preflight(&self, model: &str) -> Result { + let status = self.preflight_credits(model).await?; + Ok(ClientPreflight::new(status.summary())) + } + + async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { + OpenRouterClient::generate(self, model, prompt).await + } +} diff --git a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs index 623570298af..8e8642ada0b 100644 --- a/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs +++ b/tools/xtask-llm-benchmark/src/llm/clients/openrouter.rs @@ -1,5 +1,6 @@ -use anyhow::{anyhow, Context, Result}; -use serde::Serialize; +use anyhow::{anyhow, bail, Context, Result}; +use serde::{Deserialize, Serialize}; +use std::env; use super::http::HttpClient; use super::oa_compat::OACompatResp; @@ -35,6 +36,132 @@ impl OpenRouterClient { Self { base, api_key, http } } + pub async fn preflight_credits(&self, model: &str) -> Result { + let key_info = self.fetch_key_info().await?; + let min_credits = min_credits_threshold(); + let mut unchecked_allowed = false; + let mut model_probe = None; + + if let Some(remaining) = key_info.limit_remaining + && remaining <= min_credits + { + bail!( + "OpenRouter API key has insufficient remaining credits: {:.4} <= {:.4}", + remaining, + min_credits + ); + } + + let account = match env::var("OPENROUTER_MANAGEMENT_API_KEY") + .ok() + .filter(|v| !v.trim().is_empty()) + { + Some(key) => Some(self.fetch_account_credits(&key).await?), + None => None, + }; + + if let Some(account) = &account + && account.remaining <= min_credits + { + bail!( + "OpenRouter account has insufficient remaining credits: {:.4} <= {:.4}", + account.remaining, + min_credits + ); + } + + if account.is_none() && key_info.limit_remaining.is_none() { + if allow_unchecked_credits() { + unchecked_allowed = true; + } else { + self.probe_model(model).await?; + model_probe = Some(model.to_string()); + } + } + + Ok(OpenRouterCreditStatus { + key_limit: key_info.limit, + key_limit_remaining: key_info.limit_remaining, + account_remaining: account.map(|a| a.remaining), + min_credits, + model_probe, + unchecked_allowed, + }) + } + + async fn fetch_key_info(&self) -> Result { + let url = format!("{}/key", self.base.trim_end_matches('/')); + let auth = HttpClient::bearer(&self.api_key); + let body = self + .http + .get_text(&url, &[auth]) + .await + .with_context(|| format!("OpenRouter key preflight GET {}", url))?; + + let resp: OpenRouterKeyResp = serde_json::from_str(&body).context("parse OpenRouter key response")?; + Ok(resp.data) + } + + async fn fetch_account_credits(&self, management_key: &str) -> Result { + let url = format!("{}/credits", self.base.trim_end_matches('/')); + let auth = HttpClient::bearer(management_key); + let body = self + .http + .get_text(&url, &[auth]) + .await + .with_context(|| format!("OpenRouter account credit preflight GET {}", url))?; + + let resp: OpenRouterCreditsResp = serde_json::from_str(&body).context("parse OpenRouter credits response")?; + Ok(OpenRouterAccountCredits { + remaining: resp.data.total_credits - resp.data.total_usage, + }) + } + + async fn probe_model(&self, model: &str) -> Result<()> { + let url = format!("{}/chat/completions", self.base.trim_end_matches('/')); + + #[derive(Serialize)] + struct Req<'a> { + model: &'a str, + messages: [Msg<'a>; 1], + temperature: f32, + max_tokens: u32, + } + + #[derive(Serialize)] + struct Msg<'a> { + role: &'a str, + content: &'a str, + } + + let req = Req { + model, + messages: [Msg { + role: "user", + content: "ping", + }], + temperature: 0.0, + max_tokens: 16, + }; + let auth = HttpClient::bearer(&self.api_key); + let body = self + .http + .post_json(&url, &[auth], &req) + .await + .with_context(|| format!("OpenRouter model probe failed for '{model}'"))?; + + let resp: serde_json::Value = serde_json::from_str(&body).context("parse OpenRouter probe response")?; + if let Some(err) = resp.get("error") { + let message = err + .get("message") + .and_then(|message| message.as_str()) + .unwrap_or("unknown OpenRouter probe error"); + bail!("OpenRouter model probe failed for '{}': {}", model, message); + } + + Ok(()) + } + pub async fn generate(&self, model: &str, prompt: &BuiltPrompt) -> Result { let url = format!("{}/chat/completions", self.base.trim_end_matches('/')); @@ -125,6 +252,100 @@ impl OpenRouterClient { } } +#[derive(Debug, Clone)] +pub struct OpenRouterCreditStatus { + pub key_limit: Option, + pub key_limit_remaining: Option, + pub account_remaining: Option, + pub min_credits: f64, + pub model_probe: Option, + pub unchecked_allowed: bool, +} + +impl OpenRouterCreditStatus { + pub fn summary(&self) -> String { + let key_remaining = match (self.key_limit, self.key_limit_remaining) { + (Some(limit), Some(remaining)) => format!("key remaining {remaining:.4}/{limit:.4}"), + (Some(limit), None) => format!("key limit {limit:.4}, remaining unknown"), + (None, Some(remaining)) => format!("key remaining {remaining:.4}"), + (None, None) => "key has no configured limit".to_string(), + }; + + let credit_status = match self.account_remaining { + Some(remaining) => { + format!( + "{key_remaining}; account remaining {remaining:.4}; min {:.4}", + self.min_credits + ) + } + None => format!( + "{key_remaining}; account balance not checked (set OPENROUTER_MANAGEMENT_API_KEY); min {:.4}", + self.min_credits + ), + }; + + if let Some(model) = &self.model_probe { + format!("{credit_status}; model probe OK for '{model}'") + } else if self.unchecked_allowed { + format!("{credit_status}; unchecked credits allowed by OPENROUTER_ALLOW_UNCHECKED_CREDITS") + } else { + credit_status + } + } +} + +#[derive(Debug, Deserialize)] +struct OpenRouterKeyResp { + data: OpenRouterKeyInfo, +} + +#[derive(Debug, Deserialize)] +struct OpenRouterKeyInfo { + limit: Option, + limit_remaining: Option, +} + +#[derive(Debug, Deserialize)] +struct OpenRouterCreditsResp { + data: OpenRouterCreditsData, +} + +#[derive(Debug, Deserialize)] +struct OpenRouterCreditsData { + total_credits: f64, + total_usage: f64, +} + +#[derive(Debug, Clone)] +struct OpenRouterAccountCredits { + remaining: f64, +} + +fn min_credits_threshold() -> f64 { + let openrouter = env::var("OPENROUTER_MIN_CREDITS").ok(); + let global = env::var("LLM_MIN_CREDITS").ok(); + parse_min_credits_threshold(openrouter.as_deref(), global.as_deref()) +} + +fn allow_unchecked_credits() -> bool { + let value = env::var("OPENROUTER_ALLOW_UNCHECKED_CREDITS").ok(); + parse_env_flag(value.as_deref()) +} + +fn parse_min_credits_threshold(openrouter: Option<&str>, global: Option<&str>) -> f64 { + [openrouter, global] + .into_iter() + .flatten() + .find_map(|v| v.trim().parse::().ok()) + .unwrap_or(0.0) +} + +fn parse_env_flag(value: Option<&str>) -> bool { + value + .map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "y")) + .unwrap_or(false) +} + /// Context limits for models accessed via OpenRouter. /// Uses the same limits as direct clients where known, /// falls back to a conservative default. @@ -133,26 +354,44 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize { // Anthropic if m.contains("claude") { + if m.contains("4.6") + || m.contains("4-6") + || m.contains("4.7") + || m.contains("4-7") + || m.contains("4.8") + || m.contains("4-8") + { + return 1_000_000; + } return 185_000; } // OpenAI + if m.contains("gpt-5.5") { + return 1_050_000; + } if m.contains("gpt-5") || m.contains("gpt-4.1") { return 400_000; } if m.contains("gpt-4o") || m.contains("gpt-4") { return 128_000; } - // xAI / Grok — leave ~50 k headroom for segments + output on top of trimmed prefix - if m.contains("grok-code-fast") { + // xAI / Grok + if m.contains("grok-build-0.1") || m.contains("grok-code-fast") { return 200_000; } + if m.contains("grok-4.3") { + return 1_000_000; + } if m.contains("grok-4") { return 200_000; } if m.contains("grok") { return 90_000; } - // DeepSeek — hard cap is 131 072 on OpenRouter; leave ~25 k headroom for segments + output + // DeepSeek + if m.contains("deepseek-v4") { + return 1_000_000; + } if m.contains("deepseek") { return 106_000; } @@ -173,3 +412,26 @@ pub fn openrouter_ctx_limit_tokens(model: &str) -> usize { DEFAULT_CTX_LIMIT } + +#[cfg(test)] +mod tests { + use super::{parse_env_flag, parse_min_credits_threshold}; + + #[test] + fn openrouter_min_credits_overrides_global_threshold() { + assert_eq!(parse_min_credits_threshold(Some("2.5"), Some("1.0")), 2.5); + assert_eq!(parse_min_credits_threshold(None, Some("1.0")), 1.0); + assert_eq!(parse_min_credits_threshold(Some("not-a-number"), Some("1.0")), 1.0); + assert_eq!(parse_min_credits_threshold(None, None), 0.0); + } + + #[test] + fn unchecked_credit_escape_hatch_accepts_common_true_values() { + for value in ["1", "true", "TRUE", " yes ", "y"] { + assert!(parse_env_flag(Some(value))); + } + for value in [None, Some(""), Some("0"), Some("false"), Some("no")] { + assert!(!parse_env_flag(value)); + } + } +} diff --git a/tools/xtask-llm-benchmark/src/llm/model_routes.rs b/tools/xtask-llm-benchmark/src/llm/model_routes.rs index e136976adb6..7f7ae93b66c 100644 --- a/tools/xtask-llm-benchmark/src/llm/model_routes.rs +++ b/tools/xtask-llm-benchmark/src/llm/model_routes.rs @@ -13,16 +13,16 @@ pub struct ModelRoute { static DEFAULT_ROUTES: LazyLock> = LazyLock::new(|| { use Vendor::*; vec![ - // OpenAI: Best GPT-5.2-Codex, Cheaper GPT-5-mini - ModelRoute::new("GPT-5.2-Codex", OpenAi, "gpt-5.2-codex", Some("openai/gpt-5.2-codex")), - ModelRoute::new("GPT-5-mini", OpenAi, "gpt-5-mini", Some("openai/gpt-5-mini")), - // Claude: Best Opus 4.6, Cheaper Sonnet 4.6 - // Direct API uses dashes (claude-opus-4-6); OpenRouter uses dots (claude-opus-4.6) + // OpenAI: Best GPT-5.5, Cheaper GPT-5.4-mini + ModelRoute::new("GPT-5.5", OpenAi, "gpt-5.5", Some("openai/gpt-5.5")), + ModelRoute::new("GPT-5.4-mini", OpenAi, "gpt-5.4-mini", Some("openai/gpt-5.4-mini")), + // Claude: Best Opus 4.8, Cheaper Sonnet 4.6 + // Direct API uses dashes (claude-opus-4-8); OpenRouter uses dots (claude-opus-4.8) ModelRoute::new( - "Claude Opus 4.6", + "Claude Opus 4.8", Anthropic, - "claude-opus-4-6", - Some("anthropic/claude-opus-4.6"), + "claude-opus-4-8", + Some("anthropic/claude-opus-4.8"), ), ModelRoute::new( "Claude Sonnet 4.6", @@ -30,9 +30,9 @@ static DEFAULT_ROUTES: LazyLock> = LazyLock::new(|| { "claude-sonnet-4-6", Some("anthropic/claude-sonnet-4.6"), ), - // Grok: Best Grok 4, Cheaper Grok Code - ModelRoute::new("Grok 4", Xai, "grok-4", Some("x-ai/grok-4.20-beta")), - ModelRoute::new("Grok Code", Xai, "grok-code-fast-1", Some("x-ai/grok-code-fast-1")), + // Grok: Best Grok 4.3, coding-specialized Grok Build + ModelRoute::new("Grok 4.3", Xai, "grok-4.3", Some("x-ai/grok-4.3")), + ModelRoute::new("Grok Build 0.1", Xai, "grok-build-0.1", Some("x-ai/grok-build-0.1")), // Gemini: direct via GOOGLE_API_KEY, falls back to OpenRouter if not set ModelRoute::new( "Gemini 3.1 Pro", @@ -41,24 +41,23 @@ static DEFAULT_ROUTES: LazyLock> = LazyLock::new(|| { Some("google/gemini-3.1-pro-preview"), ), ModelRoute::new( - "Gemini 3 Flash", + "Gemini 3.5 Flash", Google, - "gemini-3-flash-preview", - Some("google/gemini-3-flash-preview"), + "gemini-3.5-flash", + Some("google/gemini-3.5-flash"), ), - // DeepSeek: Reasoner (thinking), Chat (general) - // deepseek-reasoner is listed as deepseek-r1 on OpenRouter + // DeepSeek: Pro (highest capability), Flash (cheaper/faster) ModelRoute::new( - "DeepSeek Reasoner", + "DeepSeek V4 Pro", DeepSeek, - "deepseek-reasoner", - Some("deepseek/deepseek-r1"), + "deepseek-v4-pro", + Some("deepseek/deepseek-v4-pro"), ), ModelRoute::new( - "DeepSeek Chat", + "DeepSeek V4 Flash", DeepSeek, - "deepseek-chat", - Some("deepseek/deepseek-chat"), + "deepseek-v4-flash", + Some("deepseek/deepseek-v4-flash"), ), ] }); diff --git a/tools/xtask-llm-benchmark/src/llm/provider.rs b/tools/xtask-llm-benchmark/src/llm/provider.rs index 65d587d9526..355f2e19a3e 100644 --- a/tools/xtask-llm-benchmark/src/llm/provider.rs +++ b/tools/xtask-llm-benchmark/src/llm/provider.rs @@ -1,8 +1,10 @@ use anyhow::{Context, Result}; use async_trait::async_trait; +use std::collections::HashMap; use crate::llm::clients::{ - AnthropicClient, DeepSeekClient, GoogleGeminiClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, XaiGrokClient, + AnthropicClient, DeepSeekClient, GoogleGeminiClient, LlmClient, MetaLlamaClient, OpenAiClient, OpenRouterClient, + XaiGrokClient, }; use crate::llm::model_routes::ModelRoute; use crate::llm::prompt::BuiltPrompt; @@ -10,19 +12,12 @@ use crate::llm::types::{LlmOutput, Vendor}; #[async_trait] pub trait LlmProvider: Send + Sync { + async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()>; async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result; } pub struct RouterProvider { - pub openai: Option, - pub anthropic: Option, - pub google: Option, - pub xai: Option, - pub deepseek: Option, - pub meta: Option, - /// OpenRouter client used as a unified fallback when a direct vendor client - /// is not configured. Set via `OPENROUTER_API_KEY`. - pub openrouter: Option, + clients: HashMap>, pub force: Option, } @@ -38,111 +33,145 @@ impl RouterProvider { openrouter: Option, force: Option, ) -> Self { - Self { - openai, - anthropic, - google, - xai, - deepseek, - meta, - openrouter, - force, + let mut clients: HashMap> = HashMap::new(); + + if let Some(client) = openai { + clients.insert(Vendor::OpenAi, Box::new(client)); + } + if let Some(client) = anthropic { + clients.insert(Vendor::Anthropic, Box::new(client)); + } + if let Some(client) = google { + clients.insert(Vendor::Google, Box::new(client)); + } + if let Some(client) = xai { + clients.insert(Vendor::Xai, Box::new(client)); + } + if let Some(client) = deepseek { + clients.insert(Vendor::DeepSeek, Box::new(client)); } + if let Some(client) = meta { + clients.insert(Vendor::Meta, Box::new(client)); + } + if let Some(client) = openrouter { + clients.insert(Vendor::OpenRouter, Box::new(client)); + } + + Self { clients, force } } } +struct ResolvedClient<'a> { + client: &'a dyn LlmClient, + endpoint_name: &'static str, + model: String, + fallback_from: Option<&'static str>, + search_enabled: bool, +} + #[async_trait] impl LlmProvider for RouterProvider { + async fn preflight_route(&self, route: &ModelRoute, search_enabled: bool) -> Result<()> { + let resolved = self.resolve_client(route, search_enabled)?; + let status = resolved.client.preflight(&resolved.model).await.with_context(|| { + format!( + "{} credit preflight failed for model '{}'", + resolved.endpoint_name, resolved.model + ) + })?; + + eprintln!( + "[preflight] {} -> {} '{}' OK ({})", + route.display_name, + resolved.endpoint_name, + resolved.model, + status.summary() + ); + Ok(()) + } + async fn generate(&self, route: &ModelRoute, prompt: &BuiltPrompt) -> Result { - // Web search mode: route all models through OpenRouter with :online suffix. - // OpenRouter's :online feature adds Bing-powered web search to any model. - if prompt.search_enabled { - let cli = self.openrouter.as_ref().context( - "Search mode requires OPENROUTER_API_KEY — OpenRouter provides unified web search via :online models", - )?; + let resolved = self.resolve_client(route, prompt.search_enabled)?; + + if resolved.search_enabled { + eprintln!( + "[search] {} -> OpenRouter :online model '{}'", + route.display_name, resolved.model + ); + } else if let Some(vendor_name) = resolved.fallback_from { + eprintln!( + "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'", + vendor_name, resolved.model + ); + } + + resolved.client.generate(&resolved.model, prompt).await + } +} + +impl RouterProvider { + fn resolve_client<'a>(&'a self, route: &ModelRoute, search_enabled: bool) -> Result> { + if search_enabled { let base_model = route .openrouter_model .clone() .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model)); - let online_model = format!("{base_model}:online"); - eprintln!( - "[search] {} → OpenRouter :online model '{}'", - route.display_name, online_model - ); - return cli.generate(&online_model, prompt).await; + return self.resolve_openrouter(format!("{base_model}:online"), None, true); } let vendor = self.force.unwrap_or(route.vendor); - // If vendor is explicitly OpenRouter, or if the direct client isn't configured - // but OpenRouter is available, route through OpenRouter. if vendor == Vendor::OpenRouter { - let cli = self - .openrouter - .as_ref() - .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?; let model = route.openrouter_model.as_deref().unwrap_or(&route.api_model); - return cli.generate(model, prompt).await; + return self.resolve_openrouter(model.to_string(), None, false); } - // Try direct client first, fall back to OpenRouter if available. - match vendor { - Vendor::OpenAi => match self.openai.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "OpenAI").await, - }, - Vendor::Anthropic => match self.anthropic.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "Anthropic").await, - }, - Vendor::Google => match self.google.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "Google").await, - }, - Vendor::Xai => match self.xai.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "xAI").await, - }, - Vendor::DeepSeek => match self.deepseek.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "DeepSeek").await, - }, - Vendor::Meta => match self.meta.as_ref() { - Some(cli) => cli.generate(&route.api_model, prompt).await, - None => self.fallback_openrouter(route, prompt, "Meta").await, - }, - Vendor::OpenRouter => unreachable!("handled above"), - } + let direct = self.clients.get(&vendor).map(|client| client.as_ref()); + self.resolve_direct_or_openrouter(direct, route, vendor) } -} -impl RouterProvider { - /// Fall back to the OpenRouter client when a direct vendor client is not configured. - async fn fallback_openrouter( - &self, + fn resolve_direct_or_openrouter<'a>( + &'a self, + direct: Option<&'a dyn LlmClient>, route: &ModelRoute, - prompt: &BuiltPrompt, - vendor_name: &str, - ) -> Result { - match self.openrouter.as_ref() { - Some(cli) => { - let or_model = route - .openrouter_model - .clone() - .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model)); - eprintln!( - "[openrouter] {} client not configured, falling back to OpenRouter for model '{}'", - vendor_name, or_model - ); - cli.generate(&or_model, prompt).await - } - None => anyhow::bail!( - "{} client not configured and no OpenRouter fallback available. \ - Set {}_API_KEY or OPENROUTER_API_KEY.", - vendor_name, - vendor_name.to_ascii_uppercase() - ), + vendor: Vendor, + ) -> Result> { + if let Some(client) = direct { + return Ok(ResolvedClient { + client, + endpoint_name: vendor.display_name(), + model: route.api_model.clone(), + fallback_from: None, + search_enabled: false, + }); } + + let model = route + .openrouter_model + .clone() + .unwrap_or_else(|| openrouter_model_id(route.vendor, &route.api_model)); + self.resolve_openrouter(model, Some(vendor.display_name()), false) + } + + fn resolve_openrouter<'a>( + &'a self, + model: String, + fallback_from: Option<&'static str>, + search_enabled: bool, + ) -> Result> { + let client = self + .clients + .get(&Vendor::OpenRouter) + .map(|client| client.as_ref()) + .context("OpenRouter client not configured (set OPENROUTER_API_KEY)")?; + + Ok(ResolvedClient { + client, + endpoint_name: "OpenRouter", + model, + fallback_from, + search_enabled, + }) } } diff --git a/tools/xtask-llm-benchmark/src/llm/segmentation.rs b/tools/xtask-llm-benchmark/src/llm/segmentation.rs index 2926852ada0..26bc481e52f 100644 --- a/tools/xtask-llm-benchmark/src/llm/segmentation.rs +++ b/tools/xtask-llm-benchmark/src/llm/segmentation.rs @@ -88,14 +88,29 @@ pub fn build_anthropic_messages( } // Provider-specific context limits -pub fn anthropic_ctx_limit_tokens(_model: &str) -> usize { - // Anthropic hard limit is 200k; reserve ~15k for tokenizer variance + system/segments +pub fn anthropic_ctx_limit_tokens(model: &str) -> usize { + let m = model.to_ascii_lowercase(); + + // Newer Claude 4.6+ models expose a 1M context window. + if m.contains("4-6") + || m.contains("4.6") + || m.contains("4-7") + || m.contains("4.7") + || m.contains("4-8") + || m.contains("4.8") + { + return 1_000_000; + } + + // Older Anthropic models are 200k; reserve ~15k for tokenizer variance + system/segments. 185_000 } pub fn openai_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - if m.contains("gpt-5") || m.contains("gpt-4.1") { + if m.contains("gpt-5.5") { + 1_050_000 + } else if m.contains("gpt-5") || m.contains("gpt-4.1") { 400_000 } else { 128_000 @@ -105,7 +120,13 @@ pub fn openai_ctx_limit_tokens(model: &str) -> usize { pub fn deepseek_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - // API limit 128K for deepseek-chat and deepseek-reasoner + if m.starts_with("deepseek-v4") { + return 1_000_000; + } + if m.starts_with("deepseek-v3.2") { + return 128_000; + } + // API limit 128K for deepseek-chat and deepseek-reasoner compatibility aliases. if m.starts_with("deepseek-reasoner") || m.starts_with("deepseek-r1") { return 128_000; } @@ -123,8 +144,8 @@ pub fn deepseek_ctx_limit_tokens(model: &str) -> usize { pub fn gemini_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - // Gemini 2.5 series (very large) - if m.contains("2.5") && (m.contains("pro") || m.contains("flash")) { + // Gemini 3.x and 2.5 series (very large) + if (m.contains("3.") || m.contains("2.5")) && (m.contains("pro") || m.contains("flash")) { return 1_000_000; } @@ -160,9 +181,12 @@ pub fn meta_ctx_limit_tokens(model: &str) -> usize { pub fn xai_ctx_limit_tokens(model: &str) -> usize { let m = model.to_ascii_lowercase(); - if m.contains("grok-code-fast-1") { + if m.contains("grok-build-0.1") || m.contains("grok-code-fast-1") { return 256_000; } + if m.contains("grok-4.3") { + return 1_000_000; + } if m.contains("grok-4") || m.contains("grok-3") { return 128_000; } diff --git a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj index ce04141c7a0..f286932badd 100644 --- a/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj +++ b/tools/xtask-llm-benchmark/src/templates/csharp/server/StdbModule.csproj @@ -1,9 +1,5 @@ - - - - net8.0 wasi-wasm @@ -12,9 +8,7 @@ - - - + - \ No newline at end of file +