Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a6382ca
updates
bradleyshep Jun 10, 2026
711ff88
Update provider.rs
bradleyshep Jun 11, 2026
e82f0ae
updates
bradleyshep Jun 11, 2026
bcdb41d
preflight credit checks; workflow update to use web
bradleyshep Jun 12, 2026
f2179a2
weekly goldens; workflow refinements
bradleyshep Jun 12, 2026
8d1d27e
Update publishers.rs
bradleyshep Jun 12, 2026
d5957f2
golden fixes
bradleyshep Jun 12, 2026
f1ae445
Merge branch 'master' into bradley/fix-validate-goldens-ci
bradleyshep Jun 12, 2026
4c679e2
fixes
bradleyshep Jun 12, 2026
4358ed5
Update publishers.rs
bradleyshep Jun 12, 2026
890be18
updates
bradleyshep Jun 12, 2026
480cedf
Update publishers.rs
bradleyshep Jun 12, 2026
d4999e2
fixes
bradleyshep Jun 12, 2026
e58523f
Update publishers.rs
bradleyshep Jun 12, 2026
032afd1
fixes
bradleyshep Jun 13, 2026
9eee265
Merge branch 'master' into bradley/fix-validate-goldens-ci
bradleyshep Jun 13, 2026
6037418
match smoketest (fingers crossed?)
bradleyshep Jun 13, 2026
2e6e02f
fix
bradleyshep Jun 13, 2026
b2308b1
shrug
bradleyshep Jun 13, 2026
2b133b8
fix?
bradleyshep Jun 13, 2026
7857671
testing
bradleyshep Jun 13, 2026
ee38f7a
test
bradleyshep Jun 13, 2026
77e2924
Update llm-benchmark-periodic.yml
bradleyshep Jun 13, 2026
63a9c34
revert tests
bradleyshep Jun 13, 2026
9596077
preflight no error; vendor to openrouter in periodic
bradleyshep Jun 15, 2026
65e4539
lints
bradleyshep Jun 15, 2026
4272cfd
Merge branch 'master' into bradley/fix-validate-goldens-ci
bradleyshep Jun 15, 2026
23ab3f7
Avoid .NET globalization crash in LLM benchmarks (#5335)
clockwork-labs-bot Jun 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 106 additions & 15 deletions .github/workflows/llm-benchmark-periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@ name: Periodic LLM benchmarks

on:
schedule:
# Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
# or '0 */4 * * *' for every 4h.
- cron: '0 0 * * *'
# Weekly on Monday at midnight UTC.
- cron: '0 0 * * 1'
workflow_dispatch:
inputs:
model_set:
description: 'Model set to run'
required: false
type: choice
options:
- website_active
- local_defaults
- explicit
default: website_active
models:
description: 'Models to run (provider:model format, comma-separated, or "all")'
description: 'Space-separated provider:model groups. Required when model_set=explicit.'
required: false
default: 'all'
default: ''
languages:
description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
required: false
Expand All @@ -19,12 +27,24 @@ on:
description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
required: false
default: 'guidelines,no_context'
categories:
description: 'Optional benchmark categories to run (comma-separated)'
required: false
default: ''
tasks:
description: 'Optional benchmark task ids/selectors to run (comma-separated)'
required: false
default: ''
dry_run:
description: 'Run benchmarks without uploading results'
required: false
default: 'false'

permissions:
contents: read

concurrency:
group: llm-benchmark-periodic
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
cancel-in-progress: true

jobs:
Expand All @@ -33,10 +53,9 @@ jobs:
timeout-minutes: 180

steps:
- name: Checkout master
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: master
fetch-depth: 1

- uses: dtolnay/rust-toolchain@stable
Expand All @@ -45,7 +64,7 @@ jobs:
- name: Setup .NET SDK
uses: actions/setup-dotnet@v4
with:
dotnet-version: "8.0.x"
global-json-file: global.json

- name: Install WASI workload
env:
Expand All @@ -55,13 +74,28 @@ jobs:
run: |
dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

- name: Pack C# runtime packages
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }}
run: |
dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
dotnet pack -c Release crates/bindings-csharp/Runtime

- name: Set up Node.js
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
uses: actions/setup-node@v4
with:
node-version: 22

- name: Install pnpm
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
uses: ./.github/actions/setup-pnpm
with:
run_install: true

- name: Build TypeScript SDK
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
run: pnpm build
working-directory: crates/bindings-typescript

- name: Build llm-benchmark tool
run: cargo install --path tools/xtask-llm-benchmark --locked
Expand All @@ -78,30 +112,87 @@ jobs:
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
LLM_VENDOR: openrouter
LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
DOTNET_MULTILEVEL_LOOKUP: "0"
DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
MSBUILDDISABLENODEREUSE: "1"
DOTNET_CLI_USE_MSBUILD_SERVER: "0"
LLM_BENCH_CSHARP_CONCURRENCY: "1"
INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
INPUT_MODELS: ${{ inputs.models || 'all' }}
INPUT_MODEL_SET: ${{ inputs.model_set || 'website_active' }}
INPUT_MODELS: ${{ inputs.models || '' }}
INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
INPUT_CATEGORIES: ${{ inputs.categories || '' }}
INPUT_TASKS: ${{ inputs.tasks || '' }}
INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }}
run: |
LANGS="$INPUT_LANGUAGES"
MODEL_SET="$INPUT_MODEL_SET"
MODELS="$INPUT_MODELS"
MODES="$INPUT_MODES"
CATEGORIES="$INPUT_CATEGORIES"
TASKS="$INPUT_TASKS"
DRY_RUN="$INPUT_DRY_RUN"

case "$MODEL_SET" in
website_active)
if [ -n "$MODELS" ]; then
echo "::error::models is only valid when model_set=explicit"
exit 1
fi
;;
local_defaults)
if [ -n "$MODELS" ]; then
echo "::error::models is only valid when model_set=explicit"
exit 1
fi
;;
explicit)
if [ -z "$MODELS" ]; then
echo "::error::models is required when model_set=explicit"
exit 1
fi
read -r -a MODEL_ARGS <<< "$MODELS"
;;
*)
echo "::error::unknown model_set '$MODEL_SET' (expected website_active, local_defaults, or explicit)"
exit 1
;;
esac

SUCCEEDED=0
FAILED=0
for LANG in $(echo "$LANGS" | tr ',' ' '); do
if [ "$MODELS" = "all" ]; then
if llm_benchmark run --lang "$LANG" --modes "$MODES"; then
EXTRA_ARGS=()
if [ -n "$CATEGORIES" ]; then
EXTRA_ARGS+=(--categories "$CATEGORIES")
fi
if [ -n "$TASKS" ]; then
EXTRA_ARGS+=(--tasks "$TASKS")
fi
if [ "$DRY_RUN" = "true" ]; then
EXTRA_ARGS+=(--dry-run)
fi

if [ "$MODEL_SET" = "website_active" ]; then
if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then
SUCCEEDED=$((SUCCEEDED + 1))
else
echo "::warning::Benchmark run failed for lang=$LANG"
FAILED=$((FAILED + 1))
fi
elif [ "$MODEL_SET" = "local_defaults" ]; then
if llm_benchmark run --lang "$LANG" --modes "$MODES" "${EXTRA_ARGS[@]}"; then
SUCCEEDED=$((SUCCEEDED + 1))
else
echo "::warning::Benchmark run failed for lang=$LANG"
FAILED=$((FAILED + 1))
fi
else
if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "$MODELS"; then
if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then
SUCCEEDED=$((SUCCEEDED + 1))
else
echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS"
Expand All @@ -110,7 +201,7 @@ jobs:
fi
done
echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"
if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
echo "::error::All benchmark runs failed"
if [ "$FAILED" -gt 0 ]; then
echo "::error::$FAILED benchmark run(s) failed"
exit 1
fi
43 changes: 35 additions & 8 deletions .github/workflows/llm-benchmark-validate-goldens.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,26 @@ name: Validate LLM benchmark golden answers

on:
schedule:
# Nightly at 2 AM UTC
- cron: '0 2 * * *'
workflow_dispatch: {}
# Weekly on Monday at 2 AM UTC.
- cron: '0 2 * * 1'
workflow_dispatch:
inputs:
lang:
description: 'Language to validate for manual smoke runs'
required: false
type: choice
default: all
options:
- all
- rust
- csharp
- typescript

permissions:
contents: read

concurrency:
group: llm-benchmark-validate-goldens
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
cancel-in-progress: true

jobs:
Expand All @@ -21,13 +32,12 @@ jobs:
strategy:
fail-fast: false
matrix:
lang: [rust, csharp, typescript]
lang: ${{ fromJSON(github.event_name == 'workflow_dispatch' && inputs.lang != 'all' && format('["{0}"]', inputs.lang) || '["rust","csharp","typescript"]') }}

steps:
- name: Checkout master
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: master
fetch-depth: 1

- uses: dtolnay/rust-toolchain@stable
Expand All @@ -37,7 +47,7 @@ jobs:
if: matrix.lang == 'csharp'
uses: actions/setup-dotnet@v4
with:
dotnet-version: "8.0.x"
global-json-file: global.json

- name: Install WASI workload
if: matrix.lang == 'csharp'
Expand All @@ -48,6 +58,12 @@ jobs:
run: |
dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

- name: Pack C# runtime packages
if: matrix.lang == 'csharp'
run: |
dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
dotnet pack -c Release crates/bindings-csharp/Runtime

- name: Set up Node.js
if: matrix.lang == 'typescript'
uses: actions/setup-node@v4
Expand All @@ -57,6 +73,13 @@ jobs:
- name: Install pnpm
if: matrix.lang == 'typescript'
uses: ./.github/actions/setup-pnpm
with:
run_install: true

- name: Build TypeScript SDK
if: matrix.lang == 'typescript'
run: pnpm build
working-directory: crates/bindings-typescript

- name: Build llm-benchmark tool
run: cargo install --path tools/xtask-llm-benchmark --locked
Expand All @@ -70,7 +93,11 @@ jobs:

- name: Validate golden answers (${{ matrix.lang }})
env:
DOTNET_MULTILEVEL_LOOKUP: "0"
DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
MSBUILDDISABLENODEREUSE: "1"
DOTNET_CLI_USE_MSBUILD_SERVER: "0"
LLM_BENCH_CSHARP_CONCURRENCY: "1"
run: |
llm_benchmark run --goldens-only --lang ${{ matrix.lang }}
Loading
Loading